from typing import List, Literal, Optional, Type, Union

from pydantic import ConfigDict, Field

from inference.core.entities.requests.inference import LMMInferenceRequest
from inference.core.managers.base import ModelManager
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
from inference.core.workflows.execution_engine.entities.base import (
    Batch,
    OutputDefinition,
    WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
    DICTIONARY_KIND,
    IMAGE_KIND,
    ROBOFLOW_MODEL_ID_KIND,
    ImageInputField,
    Selector,
)
from inference.core.workflows.prototypes.block import (
    BlockResult,
    WorkflowBlock,
    WorkflowBlockManifest,
)


class BlockManifest(WorkflowBlockManifest):
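    """Manifest defining the inputs, UI metadata, and outputs of the SmolVLM2 block."""
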
    # SmolVLM2 takes an image and an optional text prompt.
    images: Selector(kind=[IMAGE_KIND]) = ImageInputField
    prompt: Optional[str] = Field(
        default=None,
        description="Optional text prompt providing additional context to SmolVLM2. If omitted, an empty prompt is used.",
        examples=["What is in this image?"],
    )

    # Standard model configuration for UI, schema, etc.
    model_config = ConfigDict(
        json_schema_extra={
            "name": "SmolVLM2",
            "version": "v1",
            "short_description": "Run SmolVLM2 on an image.",
            "long_description": (
                "This workflow block runs SmolVLM2, a multimodal vision-language model. You can ask questions about images"
                " -- including documents and photos -- and get answers in natural language."
            ),
            "license": "Apache-2.0",
            "block_type": "model",
            "search_keywords": [
                "SmolVLM2",
                "smolvlm",
                "vision language model",
                "VLM",
            ],
            "is_vlm_block": True,
            "ui_manifest": {
                "section": "model",
                "icon": "fal fa-atom",
                "blockPriority": 5.5,
            },
        },
        protected_namespaces=(),
    )
    type: Literal["roboflow_core/smolvlm2@v1"]

    model_version: Union[Selector(kind=[ROBOFLOW_MODEL_ID_KIND]), str] = Field(
        default="smolvlm2/smolvlm-2.2b-instruct",
        description="The SmolVLM2 model to be used for inference.",
        examples=["smolvlm2/smolvlm-2.2b-instruct"],
    )

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        return [
            OutputDefinition(
                name="parsed_output",
                kind=[DICTIONARY_KIND],
                description="A parsed version of the output, provided as a dictionary containing the text.",
            ),
        ]

    @classmethod
    def get_parameters_accepting_batches(cls) -> List[str]:
        # Only images can be passed in as a list/batch.
        return ["images"]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        return ">=1.3.0,<2.0.0"


class SmolVLM2BlockV1(WorkflowBlock):
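    """Workflow block that runs SmolVLM2 through the model manager and returns its text response."""
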
    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
        step_execution_mode: StepExecutionMode,
    ):
        self._model_manager = model_manager
        self._api_key = api_key
        self._step_execution_mode = step_execution_mode

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        return ["model_manager", "api_key", "step_execution_mode"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        return BlockManifest

    def run(
        self,
        images: Batch[WorkflowImageData],
        model_version: str,
        prompt: Optional[str],
    ) -> BlockResult:
        if self._step_execution_mode == StepExecutionMode.LOCAL:
            return self.run_locally(
                images=images,
                model_version=model_version,
                prompt=prompt,
            )
        elif self._step_execution_mode == StepExecutionMode.REMOTE:
            raise NotImplementedError(
                "Remote execution is not supported for SmolVLM2. Please use a local or dedicated inference server."
            )
        else:
            raise ValueError(
                f"Unknown step execution mode: {self._step_execution_mode}"
            )

    def run_locally(
        self,
        images: Batch[WorkflowImageData],
        model_version: str,
        prompt: Optional[str],
    ) -> BlockResult:
        # Convert each image to the format required by the model.
        inference_images = [
            i.to_inference_format(numpy_preferred=False) for i in images
        ]
        # Use the provided prompt (or an empty string if None) for every image.
        prompt = prompt or ""
        prompts = [prompt] * len(inference_images)

        # Register SmolVLM2 with the model manager.
        self._model_manager.add_model(model_id=model_version, api_key=self._api_key)

        predictions = []
        for image, single_prompt in zip(inference_images, prompts):
            # Build an LMMInferenceRequest with both prompt and image.
            request = LMMInferenceRequest(
                api_key=self._api_key,
                model_id=model_version,
                image=image,
                source="workflow-execution",
                prompt=single_prompt,
            )
            # Run inference.
            prediction = self._model_manager.infer_from_request_sync(
                model_id=model_version, request=request
            )
            response_text = prediction.response
            predictions.append(
                {
                    "parsed_output": response_text,
                }
            )
        return predictions
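
# A minimal sketch of how this block might be referenced from a workflow definition,
# assuming the standard Roboflow workflow JSON layout. The step name "smolvlm" and the
# "$inputs.image" selector are illustrative placeholders, not taken from this file;
# the step "type", the "images"/"prompt" fields, and the "parsed_output" output come
# from the manifest above.
#
# {
#     "version": "1.0",
#     "inputs": [{"type": "WorkflowImage", "name": "image"}],
#     "steps": [
#         {
#             "type": "roboflow_core/smolvlm2@v1",
#             "name": "smolvlm",
#             "images": "$inputs.image",
#             "prompt": "What is in this image?",
#         }
#     ],
#     "outputs": [
#         {
#             "type": "JsonField",
#             "name": "parsed_output",
#             "selector": "$steps.smolvlm.parsed_output",
#         }
#     ],
# }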