
Commit 4a107f4

Adjust example instantiation of multi-stage VLM pipeline
Signed-off-by: Christoph Auer <[email protected]>
1 parent 3d07f1c commit 4a107f4

File tree

3 files changed: +8 −13 lines changed


docling/datamodel/vlm_model_specs.py

Lines changed: 0 additions & 1 deletion
@@ -229,7 +229,6 @@
     ],
     scale=2.0,
     temperature=0.0,
-    max_new_tokens=4096,
 )
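Note: with this change the spec no longer pins a token budget; the per-stage options in the pipeline diff below set their own max_new_tokens. A minimal sketch of overriding the budget on a copied spec instead (an assumption: the specs are pydantic models, as the model_copy() calls below suggest; the spec name and the 4096 value are illustrative, not taken from this hunk):

from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS

# Copy the shared spec and set a per-use budget rather than baking it into the spec.
opts = SMOLDOCLING_TRANSFORMERS.model_copy()
opts.max_new_tokens = 4096  # illustrative value, not part of this commit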

docling/models/vlm_models_inline/hf_transformers_model.py

Lines changed: 2 additions & 4 deletions
@@ -280,9 +280,7 @@ def process_images(
             padding=True,  # pad across batch for both text and vision
             # no truncation by default; match SmolDocling examples
         )
-        inputs = {
-            k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()
-        }
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}

         # -- Optional stopping criteria
         stopping_criteria = None
@@ -302,7 +300,7 @@ def process_images(
             "max_new_tokens": self.max_new_tokens,
             "use_cache": self.use_cache,
             "generation_config": self.generation_config,
-            "temperature": self.temperature,
+            # "temperature": self.temperature,
             **self.vlm_options.extra_generation_config,
         }
         if stopping_criteria is not None:
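One plausible reading of this file's changes (an assumption, not stated in the commit): the simplified dict comprehension treats every processor output as a tensor, and temperature is expected to travel with the GenerationConfig object rather than being passed to generate() a second time, which avoids duplicated sampling arguments. A minimal sketch of that pattern with Hugging Face transformers (model and inputs are placeholders, not docling API):

from transformers import GenerationConfig

# Carry sampling settings in the config object instead of as separate generate() kwargs.
gen_config = GenerationConfig(do_sample=False, temperature=0.0, max_new_tokens=1024)
# output_ids = model.generate(**inputs, generation_config=gen_config)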

docling/pipeline/threaded_multistage_vlm_pipeline.py

Lines changed: 6 additions & 8 deletions
@@ -160,16 +160,14 @@ def create_default(cls) -> ThreadedMultiStageVlmPipelineOptions:
         smoldocling_model = SMOLDOCLING_TRANSFORMERS

         text_opts = base_model.model_copy()
-        # text_opts.prompt = "Convert this page to docling."
-        text_opts.prompt = "What does it say?"
-        text_opts.response_format = ResponseFormat.PLAINTEXT
-        text_opts.max_new_tokens = 4096
+        text_opts.prompt = "Convert this page to docling."
+        text_opts.response_format = ResponseFormat.DOCTAGS
+        text_opts.max_new_tokens = 1024

         formula_opts = base_model.model_copy()
-        # formula_opts.prompt = "Convert formula to latex."
-        formula_opts.prompt = "What does it say?"
-        formula_opts.response_format = ResponseFormat.PLAINTEXT
-        formula_opts.max_new_tokens = 4096
+        formula_opts.prompt = "Convert formula to latex."
+        formula_opts.response_format = ResponseFormat.DOCTAGS
+        formula_opts.max_new_tokens = 512

         code_opts = smoldocling_model.model_copy()
         code_opts.prompt = "Convert code to text."
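For context, a hedged sketch of how the adjusted defaults might be wired into a converter. The ThreadedMultiStageVlmPipeline class name and the placement of create_default() on the options class are assumptions inferred from this diff; the DocumentConverter/PdfFormatOption wiring follows docling's usual pipeline-options pattern:

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.threaded_multistage_vlm_pipeline import (
    ThreadedMultiStageVlmPipeline,  # assumed class name, inferred from the module path
    ThreadedMultiStageVlmPipelineOptions,
)

# Build the default multi-stage options adjusted by this commit.
pipeline_options = ThreadedMultiStageVlmPipelineOptions.create_default()

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=ThreadedMultiStageVlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
# result = converter.convert("example.pdf")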
