194194 temperature = 0.0 ,
195195)
196196
197+ # GoT 2.0: inline VLM options preset for the "stepfun-ai/GOT-OCR-2.0-hf" checkpoint, using the Transformers inference framework with Markdown as the response format.
198+ GOT2_TRANSFORMERS = InlineVlmOptions (
199+ repo_id = "stepfun-ai/GOT-OCR-2.0-hf" ,
200+ prompt = "" ,  # empty prompt; the prompt style configured below is RAW
201+ response_format = ResponseFormat .MARKDOWN ,
202+ inference_framework = InferenceFramework .TRANSFORMERS ,
203+ transformers_prompt_style = TransformersPromptStyle .RAW ,
204+ transformers_model_type = TransformersModelType .AUTOMODEL_IMAGETEXTTOTEXT ,
205+ supported_devices = [
206+ AcceleratorDevice .CPU ,
207+ AcceleratorDevice .CUDA ,
208+ # AcceleratorDevice.MPS,  -- NOTE(review): MPS is left disabled; the reason is not stated here, confirm before enabling
209+ ],
210+ scale = 2.0 ,
211+ temperature = 0.0 ,
212+ stop_strings = ["<|im_end|>" ],
213+ extra_generation_config = {"format" : True },  # NOTE(review): presumably forwarded to the model's generation call to request formatted output; confirm against the consumer
214+ )
215+
216+
197217# Gemma-3
198218GEMMA3_12B_MLX = InlineVlmOptions (
199219 repo_id = "mlx-community/gemma-3-12b-it-bf16" ,
215235 temperature = 0.0 ,
216236)
217237
238+ # Dolphin
239+
218240DOLPHIN_TRANSFORMERS = InlineVlmOptions (
219241 repo_id = "ByteDance/Dolphin" ,
220242 prompt = "<s>Read text in the image. <Answer/>" ,
@@ -238,3 +260,4 @@ class VlmModelType(str, Enum):
238260 GRANITE_VISION = "granite_vision"
239261 GRANITE_VISION_VLLM = "granite_vision_vllm"
240262 GRANITE_VISION_OLLAMA = "granite_vision_ollama"
263+ GOT_OCR_2 = "got_ocr_2"
0 commit comments