[PLT-0] Fix VideoClassificationText (#2044)

lb-pno · web-flow · commit 61c29f85a69d · 2026-03-18T08:14:45.000-05:00
Co-authored-by: paulnoirel <87332996+paulnoirel@users.noreply.github.com>
diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py
@@ -209,6 +209,61 @@ def from_common(
         )
 
 
+class NDVideoTextAnswer(BaseModel):  # one text value and the frame ranges it covers
+    value: str  # the text answer itself
+    frames: List[Dict[str, int]]  # ranges shaped like {"start": 1, "end": 5}
+
+    model_config = ConfigDict(populate_by_name=True)  # no aliases are declared on this model
+
+
+class NDVideoText(BaseModel):
+    """Video text classification with per-segment text values and frame ranges.
+
+    Produces NDJSON like:
+      {"name": "...", "answer": [{"value": "text", "frames": [{"start": 1, "end": 5}]}], ...}
+    """
+
+    name: Optional[str] = None  # at least one of name / schema_id is required (must_set_one)
+    schema_id: Optional[str] = Field(default=None, alias="schemaId")
+    answer: List[NDVideoTextAnswer]  # one entry per text value
+    data_row: DataRow = Field(alias="dataRow")
+
+    model_config = ConfigDict(populate_by_name=True)  # from_video_text_group passes schema_id= / data_row= by field name
+
+    @model_validator(mode="after")
+    def must_set_one(self):  # reject a row that identifies no feature at all
+        if not self.name and not self.schema_id:  # truthiness test, so "" counts as unset too
+            raise ValueError("Schema id or name are not set. Set either one.")
+        return self
+
+    @model_serializer(mode="wrap")
+    def serialize_model(self, handler):  # drop whichever identifier is None from the dumped dict
+        res = handler(self)  # default serialization first, then prune
+        if "name" in res and res["name"] is None:
+            res.pop("name")
+        if "schemaId" in res and res["schemaId"] is None:  # NOTE(review): alias key only; a dump without by_alias would keep a None "schema_id" - confirm callers
+            res.pop("schemaId")
+        return res
+
+    @classmethod
+    def from_video_text_group(  # alternate constructor for a group of video text annotations
+        cls,
+        annotation_group: List["VideoClassificationAnnotation"],  # must be non-empty; only element 0 is read
+        frame_ranges_by_text: Dict[str, List[Dict[str, int]]],  # text value -> its frame ranges
+        data: "GenericDataRowData",  # supplies uid and global_key for the data row reference
+    ) -> "NDVideoText":
+        first = annotation_group[0]  # name and schema id come from the first annotation only
+        return cls(
+            name=first.name,
+            schema_id=first.feature_schema_id,
+            data_row=DataRow(id=data.uid, global_key=data.global_key),
+            answer=[
+                NDVideoTextAnswer(value=text_val, frames=ranges)
+                for text_val, ranges in frame_ranges_by_text.items()  # answers follow the dict's insertion order
+            ],
+        )
+
+
 class NDPromptTextSubclass(NDAnswer):
     answer: str
 
@@ -517,6 +572,7 @@ def from_common(
 NDRadioSubclass.model_rebuild()
 NDRadio.model_rebuild()
 NDText.model_rebuild()
+NDVideoText.model_rebuild()
 NDPromptText.model_rebuild()
 NDTextSubclass.model_rebuild()
 
diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py
@@ -31,6 +31,7 @@
 )
 from .temporal import create_temporal_ndjson_classifications
 from labelbox.types import DocumentRectangle, DocumentEntity
+from ...annotation_types.classification.classification import Text
 from .classification import (
     NDChecklistSubclass,
     NDClassification,
@@ -39,6 +40,7 @@
     NDPromptClassificationType,
     NDPromptText,
     NDRadioSubclass,
+    NDVideoText,
 )
 from .metric import NDConfusionMatrixMetric, NDMetricAnnotation, NDScalarMetric
 from .mmc import NDMessageTask
@@ -61,6 +63,7 @@
     NDRelationship,
     NDPromptText,
     NDMessageTask,
+    NDVideoText,
 ]
 
 
@@ -142,18 +145,43 @@ def _create_video_annotations(
                 yield NDObject.from_common(annotation=annot, data=label.data)
 
         for annotation_group in video_annotations.values():
-            segment_frame_ranges = cls._get_segment_frame_ranges(
-                annotation_group
-            )
             if isinstance(annotation_group[0], VideoClassificationAnnotation):
                 annotation = annotation_group[0]
+
+                if isinstance(annotation.value, Text):
+                    by_text = defaultdict(list)
+                    for ann in annotation_group:
+                        by_text[ann.value.answer].append(ann)
+
+                    frame_ranges_by_text = {}
+                    for text_val, anns in sorted(
+                        by_text.items(),
+                        key=lambda x: min(a.frame for a in x[1]),
+                    ):
+                        ranges = [
+                            {"start": s, "end": e}
+                            for s, e in cls._get_segment_frame_ranges(anns)
+                        ]
+                        frame_ranges_by_text[text_val] = ranges
+
+                    yield NDVideoText.from_video_text_group(
+                        annotation_group, frame_ranges_by_text, label.data
+                    )
+                    continue
+
+                segment_frame_ranges = cls._get_segment_frame_ranges(
+                    annotation_group
+                )
                 frames_data = []
                 for frames in segment_frame_ranges:
                     frames_data.append({"start": frames[0], "end": frames[-1]})
                 annotation.extra.update({"frames": frames_data})
                 yield NDClassification.from_common(annotation, label.data)
 
             elif isinstance(annotation_group[0], VideoObjectAnnotation):
+                segment_frame_ranges = cls._get_segment_frame_ranges(
+                    annotation_group
+                )
                 segments = []
                 for start_frame, end_frame in segment_frame_ranges:
                     segment = []
diff --git a/libs/labelbox/tests/data/serialization/ndjson/test_video.py b/libs/labelbox/tests/data/serialization/ndjson/test_video.py
@@ -635,6 +635,93 @@ def test_video_classification_global_subclassifications():
     assert res == [expected_first_annotation, expected_second_annotation]
 
 
+def test_video_classification_text_produces_ndjson_with_frames():
+    """VideoClassificationAnnotation + Text serializes with answer as a list of {value, frames}."""
+    label = Label(
+        data=GenericDataRowData(global_key="sample-video-text"),
+        annotations=[
+            VideoClassificationAnnotation(
+                name="free_text",
+                frame=9,
+                segment_index=0,  # segment 0 holds frames 9 and 15 with the first text value
+                value=Text(answer="Looks like a hungry big cat"),
+            ),
+            VideoClassificationAnnotation(
+                name="free_text",
+                frame=15,
+                segment_index=0,
+                value=Text(answer="Looks like a hungry big cat"),
+            ),
+            VideoClassificationAnnotation(
+                name="free_text",
+                frame=40,
+                segment_index=1,  # segment 1 holds frames 40 and 50 with the second text value
+                value=Text(answer="It's getting closer!"),
+            ),
+            VideoClassificationAnnotation(
+                name="free_text",
+                frame=50,
+                segment_index=1,
+                value=Text(answer="It's getting closer!"),
+            ),
+        ],
+    )
+    serialized = list(NDJsonConverter.serialize([label]))
+    free_text_rows = [r for r in serialized if r.get("name") == "free_text"]
+    assert len(free_text_rows) == 1  # all four annotations collapse into a single row
+
+    row = free_text_rows[0]
+    assert row["dataRow"] == {"globalKey": "sample-video-text"}
+    assert "answer" in row
+    answer = row["answer"]
+    assert isinstance(answer, list)
+    assert len(answer) == 2  # one entry per distinct text value
+
+    by_value = {a["value"]: a for a in answer}  # index by text so the checks do not depend on answer order
+    assert "Looks like a hungry big cat" in by_value
+    assert "It's getting closer!" in by_value
+    assert by_value["Looks like a hungry big cat"]["frames"] == [
+        {"start": 9, "end": 15}
+    ]
+    assert by_value["It's getting closer!"]["frames"] == [
+        {"start": 40, "end": 50}
+    ]
+
+
+def test_video_classification_text_single_text_across_frames():
+    """VideoClassificationAnnotation + Text with same text across all frames."""
+    label = Label(
+        data=GenericDataRowData(global_key="sample-video-single-text"),
+        annotations=[
+            VideoClassificationAnnotation(
+                name="free_text_per_frame",
+                frame=9,
+                segment_index=0,  # both annotations share segment 0 and the same text
+                value=Text(answer="sample text"),
+            ),
+            VideoClassificationAnnotation(
+                name="free_text_per_frame",
+                frame=15,
+                segment_index=0,
+                value=Text(answer="sample text"),
+            ),
+        ],
+    )
+    serialized = list(NDJsonConverter.serialize([label]))
+    free_text_rows = [
+        r for r in serialized if r.get("name") == "free_text_per_frame"
+    ]
+    assert len(free_text_rows) == 1  # the two annotations produce one row
+
+    row = free_text_rows[0]
+    assert row["dataRow"] == {"globalKey": "sample-video-single-text"}
+    answer = row["answer"]
+    assert isinstance(answer, list)
+    assert len(answer) == 1  # a single text value yields a single answer entry
+    assert answer[0]["value"] == "sample text"
+    assert answer[0]["frames"] == [{"start": 9, "end": 15}]  # one range spanning first to last frame
+
+
 def test_video_classification_nesting_bbox():
     bbox_annotation = [
         VideoObjectAnnotation(