
Commit d97d4ae

Merge pull request #1106 from roboflow/add-smolvlm
Add SmolVLM2
2 parents 6312998 + 74ed02d commit d97d4ae

22 files changed: +691 -8 lines changed

docs/foundation/smolvlm.md

+52
@@ -0,0 +1,52 @@
<a href="https://blog.roboflow.com/smolvlm2/" target="_blank">SmolVLM2</a> is a multimodal model developed by Hugging Face.

You can use SmolVLM2 for a range of multimodal tasks, including VQA, document OCR, document VQA, and object counting.

You can deploy SmolVLM2 with Inference.

### Installation

To install Inference with the extra dependencies necessary to run SmolVLM2, run:

```pip install inference[transformers]```

or

```pip install inference-gpu[transformers]```

### How to Use SmolVLM2

Create a new Python file called `app.py` and add the following code:

```python
from PIL import Image

from inference.models.smolvlm.smolvlm import SmolVLM

model = SmolVLM(api_key="API_KEY")

image = Image.open("dog.jpeg")

prompt = "How many dogs are in this image?"

result = model.predict(image, prompt)

print(result)
```

In this code, we load SmolVLM2, run it on an image, and print the model's response.

Above, replace:

1. `prompt` with the prompt for the model.
2. `dog.jpeg` with the path to the image that you want to run inference on.

To use SmolVLM2 with Inference, you will need a Roboflow API key. If you don't already have a Roboflow account, <a href="https://app.roboflow.com" target="_blank">sign up for a free Roboflow account</a>.

Then, run the Python script you have created:

```
python app.py
```

The result from your model will be printed to the console.
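The same `predict` call shown above can drive the document tasks the docs mention. A minimal sketch of a document OCR variation; `receipt.jpeg` and the prompt text are hypothetical stand-ins:

```python
from PIL import Image

from inference.models.smolvlm.smolvlm import SmolVLM

model = SmolVLM(api_key="API_KEY")

# Hypothetical document image; any local file path works here.
image = Image.open("receipt.jpeg")

# Swap the counting prompt for a document OCR prompt.
result = model.predict(image, "Read all of the text in this image.")
print(result)
```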

inference/core/entities/requests/inference.py

+1-1
@@ -252,6 +252,6 @@ def request_from_type(model_type, request_dict):
     elif model_type == "object-detection":
         request = ObjectDetectionInferenceRequest(**request_dict)
     else:
-        raise ValueError(f"Uknown task type {model_type}")
+        raise ValueError(f"Unknown task type {model_type}")
     request.id = request_dict.get("id", request.id)
     return request

inference/core/env.py

+2
@@ -149,6 +149,8 @@

 QWEN_2_5_ENABLED = str2bool(os.getenv("QWEN_2_5_ENABLED", True))

+SMOLVLM2_ENABLED = str2bool(os.getenv("SMOLVLM2_ENABLED", True))
+
 # Flag to enable YOLO-World core model, default is True
 CORE_MODEL_YOLO_WORLD_ENABLED = str2bool(
     os.getenv("CORE_MODEL_YOLO_WORLD_ENABLED", True)

inference/core/managers/base.py

+1
@@ -58,6 +58,7 @@ def add_model(
             )
             return
         logger.debug("ModelManager - model initialisation...")
+
         model = self.model_registry.get_model(resolved_identifier, api_key)(
             model_id=model_id,
             api_key=api_key,

inference/core/models/roboflow.py

+1
@@ -230,6 +230,7 @@ def get_model_artifacts(self) -> None:

     def cache_model_artefacts(self) -> None:
         infer_bucket_files = self.get_all_required_infer_bucket_file()
+
         if are_all_files_cached(files=infer_bucket_files, model_id=self.endpoint):
             return None
         if is_model_artefacts_bucket_available():

inference/core/registries/roboflow.py

+5
@@ -53,6 +53,7 @@
     "paligemma": ("llm", "paligemma"),
     "yolo_world": ("object-detection", "yolo-world"),
     "owlv2": ("object-detection", "owlv2"),
+    "smolvlm2": ("lmm", "smolvlm-2.2b-instruct"),
 }

 STUB_VERSION_ID = "0"
@@ -79,6 +80,7 @@ def get_model(self, model_id: ModelID, api_key: str) -> Model:
         """
         model_type = get_model_type(model_id, api_key)
         logger.debug(f"Model type: {model_type}")
+
         if model_type not in self.registry_dict:
             raise ModelNotRecognisedError(f"Model type not supported: {model_type}")
         return self.registry_dict[model_type]
@@ -129,6 +131,7 @@ def get_model_type(
     """
     model_id = resolve_roboflow_model_alias(model_id=model_id)
     dataset_id, version_id = get_model_id_chunks(model_id=model_id)
+
     if dataset_id in GENERIC_MODELS:
         logger.debug(f"Loading generic model: {dataset_id}.")
         return GENERIC_MODELS[dataset_id]
@@ -144,6 +147,7 @@ def get_model_type(
     cached_metadata = get_model_metadata_from_cache(
         dataset_id=dataset_id, version_id=version_id
     )
+
     if cached_metadata is not None:
         return cached_metadata[0], cached_metadata[1]
     if version_id == STUB_VERSION_ID:
@@ -180,6 +184,7 @@ def get_model_type(
     project_task_type = api_data.get("taskType", "object-detection")
     if api_data is None:
         raise ModelArtefactError("Error loading model artifacts from Roboflow API.")
+
     # some older projects do not have type field - hence defaulting
     model_type = api_data.get("modelType")
     if model_type is None or model_type == "ort":
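Because `smolvlm2` is now a key in `GENERIC_MODELS`, `get_model_type` short-circuits before any cache or API lookup. A sketch of the expected behaviour under that reading of the diff:

```python
from inference.core.registries.roboflow import get_model_type

# dataset_id "smolvlm2" is in GENERIC_MODELS, so no Roboflow API call is needed.
task_type, model_type = get_model_type(
    model_id="smolvlm2/smolvlm-2.2b-instruct", api_key="API_KEY"
)
print(task_type, model_type)  # expected: lmm smolvlm-2.2b-instruct
```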

inference/core/utils/roboflow.py

+1
@@ -23,6 +23,7 @@ def get_model_id_chunks(
         "owlv2",
         "trocr",
         "yolo_world",
+        "smolvlm2",
     }:
         return dataset_id, version_id
     try:
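With `smolvlm2` in this special-case set, `get_model_id_chunks` returns the two halves of the model id verbatim instead of insisting on a numeric version. A sketch under that assumption:

```python
from inference.core.utils.roboflow import get_model_id_chunks

# "smolvlm2" is in the special-case set, so the version half is returned as-is.
print(get_model_id_chunks("smolvlm2/smolvlm-2.2b-instruct"))
# expected: ('smolvlm2', 'smolvlm-2.2b-instruct')
```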

inference/core/workflows/core_steps/loader.py

+4
@@ -197,6 +197,9 @@
 from inference.core.workflows.core_steps.models.foundation.segment_anything2.v1 import (
     SegmentAnything2BlockV1,
 )
+from inference.core.workflows.core_steps.models.foundation.smolvlm.v1 import (
+    SmolVLM2BlockV1,
+)
 from inference.core.workflows.core_steps.models.foundation.stability_ai.image_gen.v1 import (
     StabilityAIImageGenBlockV1,
 )
@@ -628,6 +631,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
         LlamaVisionBlockV1,
         ImageSlicerBlockV2,
         Qwen25VLBlockV1,
+        SmolVLM2BlockV1,
     ]
inference/core/workflows/core_steps/models/foundation/smolvlm/__init__.py

Whitespace-only changes.
inference/core/workflows/core_steps/models/foundation/smolvlm/v1.py

+168
@@ -0,0 +1,168 @@
from typing import List, Literal, Optional, Type, Union

from pydantic import ConfigDict, Field

from inference.core.entities.requests.inference import LMMInferenceRequest
from inference.core.managers.base import ModelManager
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
from inference.core.workflows.execution_engine.entities.base import (
    Batch,
    OutputDefinition,
    WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
    DICTIONARY_KIND,
    IMAGE_KIND,
    ROBOFLOW_MODEL_ID_KIND,
    ImageInputField,
    Selector,
)
from inference.core.workflows.prototypes.block import (
    BlockResult,
    WorkflowBlock,
    WorkflowBlockManifest,
)


class BlockManifest(WorkflowBlockManifest):
    # SmolVLM needs an image and a text prompt.
    images: Selector(kind=[IMAGE_KIND]) = ImageInputField
    prompt: Optional[str] = Field(
        default=None,
        description="Optional text prompt to provide additional context to SmolVLM2. Otherwise it will just be None",
        examples=["What is in this image?"],
    )

    # Standard model configuration for UI, schema, etc.
    model_config = ConfigDict(
        json_schema_extra={
            "name": "SmolVLM2",
            "version": "v1",
            "short_description": "Run SmolVLM2 on an image.",
            "long_description": (
                "This workflow block runs SmolVLM2, a multimodal vision-language model. You can ask questions about images"
                " -- including documents and photos -- and get answers in natural language."
            ),
            "license": "Apache-2.0",
            "block_type": "model",
            "search_keywords": [
                "SmolVLM2",
                "smolvlm",
                "vision language model",
                "VLM",
            ],
            "is_vlm_block": True,
            "ui_manifest": {
                "section": "model",
                "icon": "fal fa-atom",
                "blockPriority": 5.5,
            },
        },
        protected_namespaces=(),
    )
    type: Literal["roboflow_core/smolvlm2@v1"]

    model_version: Union[Selector(kind=[ROBOFLOW_MODEL_ID_KIND]), str] = Field(
        default="smolvlm2/smolvlm-2.2b-instruct",
        description="The SmolVLM2 model to be used for inference.",
        examples=["smolvlm2/smolvlm-2.2b-instruct"],
    )

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        return [
            OutputDefinition(
                name="parsed_output",
                kind=[DICTIONARY_KIND],
                description="A parsed version of the output, provided as a dictionary containing the text.",
            ),
        ]

    @classmethod
    def get_parameters_accepting_batches(cls) -> List[str]:
        # Only images can be passed in as a list/batch
        return ["images"]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        return ">=1.3.0,<2.0.0"


class SmolVLM2BlockV1(WorkflowBlock):
    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
        step_execution_mode: StepExecutionMode,
    ):
        self._model_manager = model_manager
        self._api_key = api_key
        self._step_execution_mode = step_execution_mode

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        return ["model_manager", "api_key", "step_execution_mode"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        return BlockManifest

    def run(
        self,
        images: Batch[WorkflowImageData],
        model_version: str,
        prompt: Optional[str],
    ) -> BlockResult:
        if self._step_execution_mode == StepExecutionMode.LOCAL:
            return self.run_locally(
                images=images,
                model_version=model_version,
                prompt=prompt,
            )
        elif self._step_execution_mode == StepExecutionMode.REMOTE:
            raise NotImplementedError(
                "Remote execution is not supported for SmolVLM2. Please use a local or dedicated inference server."
            )
        else:
            raise ValueError(
                f"Unknown step execution mode: {self._step_execution_mode}"
            )

    def run_locally(
        self,
        images: Batch[WorkflowImageData],
        model_version: str,
        prompt: Optional[str],
    ) -> BlockResult:
        # Convert each image to the format required by the model.
        inference_images = [
            i.to_inference_format(numpy_preferred=False) for i in images
        ]
        # Use the provided prompt (or an empty string if None) for every image.
        prompt = prompt or ""
        prompts = [prompt] * len(inference_images)

        # Register SmolVLM2 with the model manager.
        self._model_manager.add_model(model_id=model_version, api_key=self._api_key)

        predictions = []
        for image, single_prompt in zip(inference_images, prompts):
            # Build an LMMInferenceRequest with both prompt and image.
            request = LMMInferenceRequest(
                api_key=self._api_key,
                model_id=model_version,
                image=image,
                source="workflow-execution",
                prompt=single_prompt,
            )
            # Run inference.
            prediction = self._model_manager.infer_from_request_sync(
                model_id=model_version, request=request
            )
            response_text = prediction.response
            predictions.append(
                {
                    "parsed_output": response_text,
                }
            )
        return predictions
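For reference, a workflow definition that exercises this block might look like the sketch below. It is built only from the manifest fields above (`type`, `images`, `prompt`, `model_version`, and the `parsed_output` output); the exact runner wiring varies by deployment, so treat the step and selector names as hypothetical:

```python
# A hypothetical workflow spec using the manifest fields defined above.
SMOLVLM2_WORKFLOW = {
    "version": "1.0",
    "inputs": [{"type": "InferenceImage", "name": "image"}],
    "steps": [
        {
            "type": "roboflow_core/smolvlm2@v1",
            "name": "smolvlm",
            "images": "$inputs.image",
            "prompt": "How many dogs are in this image?",
            "model_version": "smolvlm2/smolvlm-2.2b-instruct",
        }
    ],
    "outputs": [
        {
            "type": "JsonField",
            "name": "parsed_output",
            "selector": "$steps.smolvlm.parsed_output",
        }
    ],
}
```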

inference/models/README.md

+1
@@ -26,6 +26,7 @@ The models supported by Roboflow Inference have their own licenses. View the lic
 | `inference/models/yolov10` | [AGPL-3.0](https://github.com/THU-MIG/yolov10/blob/main/LICENSE) ||
 | `inference/models/yolov11` | [AGPL-3.0](https://github.com/ultralytics/ultralytics/blob/master/LICENSE) ||
 | `inference/models/yolov12` | [AGPL-3.0](https://github.com/sunsmarterjie/yolov12?tab=AGPL-3.0-1-ov-file) ||
+| `inference/models/smolvlm2` | [Apache 2.0](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) | 👍 |
 | `inference/models/rfdetr` | [Apache 2.0](https://github.com/roboflow/rf-detr/blob/main/LICENSE) | 👍 |

 ## Commercial Licenses

inference/models/__init__.py

+5
@@ -72,6 +72,11 @@
 except:
     pass

+try:
+    from inference.models.smolvlm import SmolVLM
+except:
+    pass
+
 from inference.models.resnet import ResNetClassification
 from inference.models.rfdetr import RFDETRObjectDetection
 from inference.models.vit import VitClassification
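Because the import is wrapped in try/except, `SmolVLM` is only re-exported from `inference.models` when the optional transformers dependencies resolve. A feature-detection sketch:

```python
# SmolVLM is only present when the transformers extras are installed.
try:
    from inference.models import SmolVLM
    print("SmolVLM2 is available")
except ImportError:
    print("Install inference[transformers] to enable SmolVLM2")
```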

inference/models/aliases.py

+4
@@ -48,6 +48,10 @@
     **{k.replace("yolov11", "yolo11"): v for k, v in YOLOV11_ALIASES.items()},
 }

+SMOLVLM_ALIASES = {
+    "smolvlm2": "smolvlm-2.2b-instruct",
+}
+
 RFDETR_ALIASES = {
     "rfdetr-base": "coco/36",
     "rfdetr-large": "coco/37",
