From fcc1107d5b1c2de5d5766610a844a1ca91233bb9 Mon Sep 17 00:00:00 2001
From: paulnoirel <87332996+paulnoirel@users.noreply.github.com>
Date: Mon, 23 Feb 2026 18:18:58 +0000
Subject: [PATCH 1/2] Fix NDJSON serialization of video Text classifications

---
 .../serialization/ndjson/classification.py    | 42 +++++++++
 .../data/serialization/ndjson/label.py        | 34 +++++++-
 .../data/serialization/ndjson/test_video.py   | 87 +++++++++++++++++++
 3 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py
index 49177bba2..536b0bf53 100644
--- a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py
+++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py
@@ -209,6 +209,47 @@ def from_common(
         )
 
 
+class NDVideoTextAnswer(BaseModel):
+    value: str
+    frames: List[Dict[str, int]]
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class NDVideoText(BaseModel):
+    """Video text classification with per-segment text values and frame ranges.
+
+    Produces NDJSON like:
+      {"name": "...", "answer": [{"value": "text", "frames": [{"start": 1, "end": 5}]}], ...}
+    """
+
+    name: str
+    answer: List[NDVideoTextAnswer]
+    dataRow: Dict[str, str]
+
+    @classmethod
+    def from_video_text_group(
+        cls,
+        annotation_group: List["VideoClassificationAnnotation"],
+        frame_ranges_by_text: Dict[str, List[Dict[str, int]]],
+        data: "GenericDataRowData",
+    ) -> "NDVideoText":
+        first = annotation_group[0]
+        data_row = {}
+        if data.global_key:
+            data_row["globalKey"] = data.global_key
+        elif data.uid:
+            data_row["id"] = data.uid
+        return cls(
+            name=first.name,
+            dataRow=data_row,
+            answer=[
+                NDVideoTextAnswer(value=text_val, frames=ranges)
+                for text_val, ranges in frame_ranges_by_text.items()
+            ],
+        )
+
+
 class NDPromptTextSubclass(NDAnswer):
     answer: str
 
@@ -517,6 +558,7 @@ def from_common(
 NDRadioSubclass.model_rebuild()
 NDRadio.model_rebuild()
 NDText.model_rebuild()
+NDVideoText.model_rebuild()
 NDPromptText.model_rebuild()
 NDTextSubclass.model_rebuild()
 
diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py
index 39deafa64..fc06fd959 100644
--- a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py
+++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py
@@ -31,6 +31,7 @@
 )
 from .temporal import create_temporal_ndjson_classifications
 from labelbox.types import DocumentRectangle, DocumentEntity
+from ...annotation_types.classification.classification import Text
 from .classification import (
     NDChecklistSubclass,
     NDClassification,
@@ -39,6 +40,7 @@
     NDPromptClassificationType,
     NDPromptText,
     NDRadioSubclass,
+    NDVideoText,
 )
 from .metric import NDConfusionMatrixMetric, NDMetricAnnotation, NDScalarMetric
 from .mmc import NDMessageTask
@@ -61,6 +63,7 @@
     NDRelationship,
     NDPromptText,
     NDMessageTask,
+    NDVideoText,
 ]
 
 
@@ -142,11 +145,33 @@ def _create_video_annotations(
                 yield NDObject.from_common(annotation=annot, data=label.data)
 
         for annotation_group in video_annotations.values():
-            segment_frame_ranges = cls._get_segment_frame_ranges(
-                annotation_group
-            )
             if isinstance(annotation_group[0], VideoClassificationAnnotation):
                 annotation = annotation_group[0]
+
+                if isinstance(annotation.value, Text):
+                    by_text = defaultdict(list)
+                    for ann in annotation_group:
+                        by_text[ann.value.answer].append(ann)
+
+                    frame_ranges_by_text = {}
+                    for text_val, anns in sorted(
+                        by_text.items(),
+                        key=lambda x: min(a.frame for a in x[1]),
+                    ):
+                        ranges = [
+                            {"start": s, "end": e}
+                            for s, e in cls._get_segment_frame_ranges(anns)
+                        ]
+                        frame_ranges_by_text[text_val] = ranges
+
+                    yield NDVideoText.from_video_text_group(
+                        annotation_group, frame_ranges_by_text, label.data
+                    )
+                    continue
+
+                segment_frame_ranges = cls._get_segment_frame_ranges(
+                    annotation_group
+                )
                 frames_data = []
                 for frames in segment_frame_ranges:
                     frames_data.append({"start": frames[0], "end": frames[-1]})
@@ -154,6 +179,9 @@ def _create_video_annotations(
                 yield NDClassification.from_common(annotation, label.data)
 
             elif isinstance(annotation_group[0], VideoObjectAnnotation):
+                segment_frame_ranges = cls._get_segment_frame_ranges(
+                    annotation_group
+                )
                 segments = []
                 for start_frame, end_frame in segment_frame_ranges:
                     segment = []
diff --git a/libs/labelbox/tests/data/serialization/ndjson/test_video.py b/libs/labelbox/tests/data/serialization/ndjson/test_video.py
index 6c14343a4..119a614e1 100644
--- a/libs/labelbox/tests/data/serialization/ndjson/test_video.py
+++ b/libs/labelbox/tests/data/serialization/ndjson/test_video.py
@@ -635,6 +635,93 @@ def test_video_classification_global_subclassifications():
     assert res == [expected_first_annotation, expected_second_annotation]
 
 
+def test_video_classification_text_produces_ndjson_with_frames():
+    """VideoClassificationAnnotation + Text serializes with answer as a list of {value, frames}."""
+    label = Label(
+        data=GenericDataRowData(global_key="sample-video-text"),
+        annotations=[
+            VideoClassificationAnnotation(
+                name="free_text",
+                frame=9,
+                segment_index=0,
+                value=Text(answer="Looks like a hungry big cat"),
+            ),
+            VideoClassificationAnnotation(
+                name="free_text",
+                frame=15,
+                segment_index=0,
+                value=Text(answer="Looks like a hungry big cat"),
+            ),
+            VideoClassificationAnnotation(
+                name="free_text",
+                frame=40,
+                segment_index=1,
+                value=Text(answer="It's getting closer!"),
+            ),
+            VideoClassificationAnnotation(
+                name="free_text",
+                frame=50,
+                segment_index=1,
+                value=Text(answer="It's getting closer!"),
+            ),
+        ],
+    )
+    serialized = list(NDJsonConverter.serialize([label]))
+    free_text_rows = [r for r in serialized if r.get("name") == "free_text"]
+    assert len(free_text_rows) == 1
+
+    row = free_text_rows[0]
+    assert row["dataRow"] == {"globalKey": "sample-video-text"}
+    assert "answer" in row
+    answer = row["answer"]
+    assert isinstance(answer, list)
+    assert len(answer) == 2
+
+    by_value = {a["value"]: a for a in answer}
+    assert "Looks like a hungry big cat" in by_value
+    assert "It's getting closer!" in by_value
+    assert by_value["Looks like a hungry big cat"]["frames"] == [
+        {"start": 9, "end": 15}
+    ]
+    assert by_value["It's getting closer!"]["frames"] == [
+        {"start": 40, "end": 50}
+    ]
+
+
+def test_video_classification_text_single_text_across_frames():
+    """VideoClassificationAnnotation + Text with same text across all frames."""
+    label = Label(
+        data=GenericDataRowData(global_key="sample-video-single-text"),
+        annotations=[
+            VideoClassificationAnnotation(
+                name="free_text_per_frame",
+                frame=9,
+                segment_index=0,
+                value=Text(answer="sample text"),
+            ),
+            VideoClassificationAnnotation(
+                name="free_text_per_frame",
+                frame=15,
+                segment_index=0,
+                value=Text(answer="sample text"),
+            ),
+        ],
+    )
+    serialized = list(NDJsonConverter.serialize([label]))
+    free_text_rows = [
+        r for r in serialized if r.get("name") == "free_text_per_frame"
+    ]
+    assert len(free_text_rows) == 1
+
+    row = free_text_rows[0]
+    assert row["dataRow"] == {"globalKey": "sample-video-single-text"}
+    answer = row["answer"]
+    assert isinstance(answer, list)
+    assert len(answer) == 1
+    assert answer[0]["value"] == "sample text"
+    assert answer[0]["frames"] == [{"start": 9, "end": 15}]
+
+
 def test_video_classification_nesting_bbox():
     bbox_annotation = [
         VideoObjectAnnotation(

From 210d84e6528f824582bbdbae4ee627290cf3df8e Mon Sep 17 00:00:00 2001
From: paulnoirel <87332996+paulnoirel@users.noreply.github.com>
Date: Thu, 26 Feb 2026 10:36:17 +0000
Subject: [PATCH 2/2] Use DataRow model and support schemaId in NDVideoText

---
 .../serialization/ndjson/classification.py    | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py
index 536b0bf53..fc519edb2 100644
--- a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py
+++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py
@@ -223,9 +223,27 @@ class NDVideoText(BaseModel):
       {"name": "...", "answer": [{"value": "text", "frames": [{"start": 1, "end": 5}]}], ...}
     """
 
-    name: str
+    name: Optional[str] = None
+    schema_id: Optional[str] = Field(default=None, alias="schemaId")
     answer: List[NDVideoTextAnswer]
-    dataRow: Dict[str, str]
+    data_row: DataRow = Field(alias="dataRow")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+    @model_validator(mode="after")
+    def must_set_one(self):
+        if not self.name and not self.schema_id:
+            raise ValueError("Schema id or name are not set. Set either one.")
+        return self
+
+    @model_serializer(mode="wrap")
+    def serialize_model(self, handler):
+        res = handler(self)
+        if "name" in res and res["name"] is None:
+            res.pop("name")
+        if "schemaId" in res and res["schemaId"] is None:
+            res.pop("schemaId")
+        return res
 
     @classmethod
     def from_video_text_group(
@@ -235,14 +253,10 @@ def from_video_text_group(
         data: "GenericDataRowData",
     ) -> "NDVideoText":
         first = annotation_group[0]
-        data_row = {}
-        if data.global_key:
-            data_row["globalKey"] = data.global_key
-        elif data.uid:
-            data_row["id"] = data.uid
         return cls(
             name=first.name,
-            dataRow=data_row,
+            schema_id=first.feature_schema_id,
+            data_row=DataRow(id=data.uid, global_key=data.global_key),
             answer=[
                 NDVideoTextAnswer(value=text_val, frames=ranges)
                 for text_val, ranges in frame_ranges_by_text.items()