diff --git a/supervision/detection/core.py b/supervision/detection/core.py index ffe5ed3fc..a6b551508 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -946,6 +946,26 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio ``` !!! example "Gemini 2.0" + + ??? tip "Prompt engineering" + + From Gemini 2.0 onwards, models are further trained to detect objects in + an image and get their bounding box coordinates. The coordinates are + normalized to the range [0, 1000] relative to the image dimensions, so + you need to rescale them to your original image size. + + According to the Gemini API documentation on image prompts, when using + a single image with text, the recommended approach is to place the text + prompt after the image part in the contents array. This ordering has + been shown to produce significantly better results in practice. + + To get the best results from Google Gemini 2.0, use the following prompt. + + ``` + Detect all the cats and dogs in the image. The box_2d should be + [ymin, xmin, ymax, xmax] normalized to 0-1000. + ``` + ```python import supervision as sv @@ -983,6 +1003,11 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio including small, distant, or partially visible ones, and to return tight bounding boxes. + According to the Gemini API documentation on image prompts, when using + a single image with text, the recommended approach is to place the text + prompt after the image part in the contents array. This ordering has + been shown to produce significantly better results in practice. + ``` Carefully examine this image and detect ALL visible objects, including small, distant, or partially visible ones. @@ -1323,6 +1348,26 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio ``` !!! example "Gemini 2.0" + + ??? 
tip "Prompt engineering" + + From Gemini 2.0 onwards, models are further trained to detect objects in + an image and get their bounding box coordinates. The coordinates are + normalized to the range [0, 1000] relative to the image dimensions, so + you need to rescale them to your original image size. + + According to the Gemini API documentation on image prompts, when using + a single image with text, the recommended approach is to place the text + prompt after the image part in the contents array. This ordering has + been shown to produce significantly better results in practice. + + To get the best results from Google Gemini 2.0, use the following prompt. + + ``` + Detect all the cats and dogs in the image. The box_2d should be + [ymin, xmin, ymax, xmax] normalized to 0-1000. + ``` + ```python import supervision as sv @@ -1360,6 +1405,11 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio including small, distant, or partially visible ones, and to return tight bounding boxes. + According to the Gemini API documentation on image prompts, when using + a single image with text, the recommended approach is to place the text + prompt after the image part in the contents array. This ordering has + been shown to produce significantly better results in practice. + ``` Carefully examine this image and detect ALL visible objects, including small, distant, or partially visible ones.