Commit b5b7e6d

Add GoT OCR 2.0

Signed-off-by: Christoph Auer <[email protected]>

1 parent 4a107f4

4 files changed (+45, -10 lines)

docling/cli/main.py

Lines changed: 3 additions & 0 deletions
@@ -59,6 +59,7 @@
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
+    GOT2_TRANSFORMERS,
     GRANITE_VISION_OLLAMA,
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
@@ -621,6 +622,8 @@ def convert( # noqa: C901
             pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
             pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+        elif vlm_model == VlmModelType.GOT_OCR_2:
+            pipeline_options.vlm_options = GOT2_TRANSFORMERS
         elif vlm_model == VlmModelType.SMOLDOCLING:
             pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
             if sys.platform == "darwin":
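With this mapping in place, the model should be selectable through the existing VLM pipeline flags, along the lines of: docling --pipeline vlm --vlm-model got_ocr_2 document.pdf (flag names follow the current CLI; document.pdf is a placeholder input).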

docling/datamodel/pipeline_options_vlm_model.py

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ class TransformersModelType(str, Enum):
 class TransformersPromptStyle(str, Enum):
     CHAT = "chat"
     RAW = "raw"
+    NONE = "none"


 class InlineVlmOptions(BaseVlmOptions):
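The new NONE style marks processors that take no text prompt at all: as the hf_transformers_model.py hunk below shows, the processor is then invoked with images only, and any model-specific prompt tokens are assembled by the processor itself (as the Hugging Face GOT-OCR 2.0 processor does).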

docling/datamodel/vlm_model_specs.py

Lines changed: 23 additions & 0 deletions
@@ -194,6 +194,26 @@
     temperature=0.0,
 )

+# GoT 2.0
+GOT2_TRANSFORMERS = InlineVlmOptions(
+    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+    prompt="",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_prompt_style=TransformersPromptStyle.NONE,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["<|im_end|>"],
+    # extra_generation_config={"format": True},
+)
+
+
 # Gemma-3
 GEMMA3_12B_MLX = InlineVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
@@ -215,6 +235,8 @@
     temperature=0.0,
 )

+# Dolphin
+
 DOLPHIN_TRANSFORMERS = InlineVlmOptions(
     repo_id="ByteDance/Dolphin",
     prompt="<s>Read text in the image. <Answer/>",
@@ -238,3 +260,4 @@ class VlmModelType(str, Enum):
     GRANITE_VISION = "granite_vision"
     GRANITE_VISION_VLLM = "granite_vision_vllm"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    GOT_OCR_2 = "got_ocr_2"
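For reference, a minimal sketch of driving the new spec through docling's Python API (this assumes the existing VlmPipeline converter wiring; "document.pdf" is a placeholder input):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import GOT2_TRANSFORMERS
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Point the VLM pipeline at the GoT OCR 2.0 spec added above.
pipeline_options = VlmPipelineOptions(vlm_options=GOT2_TRANSFORMERS)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

# GOT2_TRANSFORMERS declares ResponseFormat.MARKDOWN, so markdown
# export is the natural output here.
result = converter.convert("document.pdf")  # placeholder input
print(result.document.export_to_markdown())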

docling/models/vlm_models_inline/hf_transformers_model.py

Lines changed: 18 additions & 10 deletions
@@ -270,16 +270,24 @@ def process_images(
             user_prompts = prompt

         # Use your prompt formatter verbatim
-        prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
-
-        # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
-        inputs = self.processor(
-            text=prompts,
-            images=pil_images,
-            return_tensors="pt",
-            padding=True,  # pad across batch for both text and vision
-            # no truncation by default; match SmolDocling examples
-        )
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.NONE:
+            inputs = self.processor(
+                pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                # no truncation by default; match SmolDocling examples
+            )
+        else:
+            prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
+
+            # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
+            inputs = self.processor(
+                text=prompts,
+                images=pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                # no truncation by default; match SmolDocling examples
+            )
         inputs = {k: v.to(self.device) for k, v in inputs.items()}

         # -- Optional stopping criteria
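For context, the images-only branch matches how this model is driven directly in transformers. A minimal sketch, assuming the Hugging Face GOT-OCR 2.0 integration ("page.png" is a placeholder input):

from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

repo = "stepfun-ai/GOT-OCR-2.0-hf"
processor = AutoProcessor.from_pretrained(repo)
model = AutoModelForImageTextToText.from_pretrained(repo)

# No text argument: the processor assembles the prompt tokens itself,
# which is the case TransformersPromptStyle.NONE exists for.
inputs = processor(Image.open("page.png"), return_tensors="pt")

output_ids = model.generate(
    **inputs,
    do_sample=False,
    tokenizer=processor.tokenizer,  # generate() needs the tokenizer for stop_strings
    stop_strings=["<|im_end|>"],    # same stop string as GOT2_TRANSFORMERS above
)

# Decode only the newly generated tokens, skipping the prompt.
text = processor.decode(
    output_ids[0, inputs["input_ids"].shape[1]:],
    skip_special_tokens=True,
)
print(text)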
