Skip to content

Commit 4ba3c50

Browse files
committed
Add GoT OCR 2.0
Signed-off-by: Christoph Auer <[email protected]>
1 parent 4a107f4 commit 4ba3c50

File tree

2 files changed

+26
-0
lines changed

2 files changed

+26
-0
lines changed

docling/cli/main.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@
5959
)
6060
from docling.datamodel.settings import settings
6161
from docling.datamodel.vlm_model_specs import (
62+
GOT2_TRANSFORMERS,
6263
GRANITE_VISION_OLLAMA,
6364
GRANITE_VISION_TRANSFORMERS,
6465
SMOLDOCLING_MLX,
@@ -621,6 +622,8 @@ def convert( # noqa: C901
621622
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
622623
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
623624
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
625+
elif vlm_model == VlmModelType.GOT_OCR_2:
626+
pipeline_options.vlm_options = GOT2_TRANSFORMERS
624627
elif vlm_model == VlmModelType.SMOLDOCLING:
625628
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
626629
if sys.platform == "darwin":

docling/datamodel/vlm_model_specs.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -194,6 +194,26 @@
194194
temperature=0.0,
195195
)
196196

197+
# GOT-OCR 2.0 (stepfun-ai/GOT-OCR-2.0-hf)
198+
GOT2_TRANSFORMERS = InlineVlmOptions(
199+
repo_id="stepfun-ai/GOT-OCR-2.0-hf",
200+
prompt="",
201+
response_format=ResponseFormat.MARKDOWN,
202+
inference_framework=InferenceFramework.TRANSFORMERS,
203+
transformers_prompt_style=TransformersPromptStyle.RAW,
204+
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
205+
supported_devices=[
206+
AcceleratorDevice.CPU,
207+
AcceleratorDevice.CUDA,
208+
# AcceleratorDevice.MPS,
209+
],
210+
scale=2.0,
211+
temperature=0.0,
212+
stop_strings=["<|im_end|>"],
213+
extra_generation_config={"format": True},
214+
)
215+
216+
197217
# Gemma-3
198218
GEMMA3_12B_MLX = InlineVlmOptions(
199219
repo_id="mlx-community/gemma-3-12b-it-bf16",
@@ -215,6 +235,8 @@
215235
temperature=0.0,
216236
)
217237

238+
# Dolphin
239+
218240
DOLPHIN_TRANSFORMERS = InlineVlmOptions(
219241
repo_id="ByteDance/Dolphin",
220242
prompt="<s>Read text in the image. <Answer/>",
@@ -238,3 +260,4 @@ class VlmModelType(str, Enum):
238260
GRANITE_VISION = "granite_vision"
239261
GRANITE_VISION_VLLM = "granite_vision_vllm"
240262
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
263+
GOT_OCR_2 = "got_ocr_2"

0 commit comments

Comments
 (0)