From fcc1107d5b1c2de5d5766610a844a1ca91233bb9 Mon Sep 17 00:00:00 2001 From: paulnoirel <87332996+paulnoirel@users.noreply.github.com> Date: Mon, 23 Feb 2026 18:18:58 +0000 Subject: [PATCH 1/2] Fix VideoClassificationText --- .../serialization/ndjson/classification.py | 42 +++++++++ .../data/serialization/ndjson/label.py | 34 +++++++- .../data/serialization/ndjson/test_video.py | 87 +++++++++++++++++++ 3 files changed, 160 insertions(+), 3 deletions(-) diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py index 49177bba2..536b0bf53 100644 --- a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py +++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py @@ -209,6 +209,47 @@ def from_common( ) +class NDVideoTextAnswer(BaseModel): + value: str + frames: List[Dict[str, int]] + + model_config = ConfigDict(populate_by_name=True) + + +class NDVideoText(BaseModel): + """Video text classification with per-segment text values and frame ranges. + + Produces NDJSON like: + {"name": "...", "answer": [{"value": "text", "frames": [{"start": 1, "end": 5}]}], ...} + """ + + name: str + answer: List[NDVideoTextAnswer] + dataRow: Dict[str, str] + + @classmethod + def from_video_text_group( + cls, + annotation_group: List["VideoClassificationAnnotation"], + frame_ranges_by_text: Dict[str, List[Dict[str, int]]], + data: "GenericDataRowData", + ) -> "NDVideoText": + first = annotation_group[0] + data_row = {} + if data.global_key: + data_row["globalKey"] = data.global_key + elif data.uid: + data_row["id"] = data.uid + return cls( + name=first.name, + dataRow=data_row, + answer=[ + NDVideoTextAnswer(value=text_val, frames=ranges) + for text_val, ranges in frame_ranges_by_text.items() + ], + ) + + class NDPromptTextSubclass(NDAnswer): answer: str @@ -517,6 +558,7 @@ def from_common( NDRadioSubclass.model_rebuild() NDRadio.model_rebuild() NDText.model_rebuild() +NDVideoText.model_rebuild() NDPromptText.model_rebuild() NDTextSubclass.model_rebuild() diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py index 39deafa64..fc06fd959 100644 --- a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py +++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py @@ -31,6 +31,7 @@ ) from .temporal import create_temporal_ndjson_classifications from labelbox.types import DocumentRectangle, DocumentEntity +from ...annotation_types.classification.classification import Text from .classification import ( NDChecklistSubclass, NDClassification, @@ -39,6 +40,7 @@ NDPromptClassificationType, NDPromptText, NDRadioSubclass, + NDVideoText, ) from .metric import NDConfusionMatrixMetric, NDMetricAnnotation, NDScalarMetric from .mmc import NDMessageTask @@ -61,6 +63,7 @@ NDRelationship, NDPromptText, NDMessageTask, + NDVideoText, ] @@ -142,11 +145,33 @@ def _create_video_annotations( yield NDObject.from_common(annotation=annot, data=label.data) for annotation_group in video_annotations.values(): - segment_frame_ranges = cls._get_segment_frame_ranges( - annotation_group - ) if isinstance(annotation_group[0], VideoClassificationAnnotation): annotation = annotation_group[0] + + if isinstance(annotation.value, Text): + by_text = defaultdict(list) + for ann in annotation_group: + by_text[ann.value.answer].append(ann) + + frame_ranges_by_text = {} + for text_val, anns in sorted( + by_text.items(), + key=lambda x: min(a.frame for a in x[1]), + ): + ranges = [ + {"start": s, "end": e} + for s, e in cls._get_segment_frame_ranges(anns) + ] + frame_ranges_by_text[text_val] = ranges + + yield NDVideoText.from_video_text_group( + annotation_group, frame_ranges_by_text, label.data + ) + continue + + segment_frame_ranges = cls._get_segment_frame_ranges( + annotation_group + ) frames_data = [] for frames in segment_frame_ranges: frames_data.append({"start": frames[0], "end": frames[-1]}) @@ -154,6 +179,9 @@ def _create_video_annotations( yield NDClassification.from_common(annotation, label.data) elif isinstance(annotation_group[0], VideoObjectAnnotation): + segment_frame_ranges = cls._get_segment_frame_ranges( + annotation_group + ) segments = [] for start_frame, end_frame in segment_frame_ranges: segment = [] diff --git a/libs/labelbox/tests/data/serialization/ndjson/test_video.py b/libs/labelbox/tests/data/serialization/ndjson/test_video.py index 6c14343a4..119a614e1 100644 --- a/libs/labelbox/tests/data/serialization/ndjson/test_video.py +++ b/libs/labelbox/tests/data/serialization/ndjson/test_video.py @@ -635,6 +635,93 @@ def test_video_classification_global_subclassifications(): assert res == [expected_first_annotation, expected_second_annotation] +def test_video_classification_text_produces_ndjson_with_frames(): + """VideoClassificationAnnotation + Text serializes with answer as a list of {value, frames}.""" + label = Label( + data=GenericDataRowData(global_key="sample-video-text"), + annotations=[ + VideoClassificationAnnotation( + name="free_text", + frame=9, + segment_index=0, + value=Text(answer="Looks like a hungry big cat"), + ), + VideoClassificationAnnotation( + name="free_text", + frame=15, + segment_index=0, + value=Text(answer="Looks like a hungry big cat"), + ), + VideoClassificationAnnotation( + name="free_text", + frame=40, + segment_index=1, + value=Text(answer="It's getting closer!"), + ), + VideoClassificationAnnotation( + name="free_text", + frame=50, + segment_index=1, + value=Text(answer="It's getting closer!"), + ), + ], + ) + serialized = list(NDJsonConverter.serialize([label])) + free_text_rows = [r for r in serialized if r.get("name") == "free_text"] + assert len(free_text_rows) == 1 + + row = free_text_rows[0] + assert row["dataRow"] == {"globalKey": "sample-video-text"} + assert "answer" in row + answer = row["answer"] + assert isinstance(answer, list) + assert len(answer) == 2 + + by_value = {a["value"]: a for a in answer} + assert "Looks like a hungry big cat" in by_value + assert "It's getting closer!" in by_value + assert by_value["Looks like a hungry big cat"]["frames"] == [ + {"start": 9, "end": 15} + ] + assert by_value["It's getting closer!"]["frames"] == [ + {"start": 40, "end": 50} + ] + + +def test_video_classification_text_single_text_across_frames(): + """VideoClassificationAnnotation + Text with same text across all frames.""" + label = Label( + data=GenericDataRowData(global_key="sample-video-single-text"), + annotations=[ + VideoClassificationAnnotation( + name="free_text_per_frame", + frame=9, + segment_index=0, + value=Text(answer="sample text"), + ), + VideoClassificationAnnotation( + name="free_text_per_frame", + frame=15, + segment_index=0, + value=Text(answer="sample text"), + ), + ], + ) + serialized = list(NDJsonConverter.serialize([label])) + free_text_rows = [ + r for r in serialized if r.get("name") == "free_text_per_frame" + ] + assert len(free_text_rows) == 1 + + row = free_text_rows[0] + assert row["dataRow"] == {"globalKey": "sample-video-single-text"} + answer = row["answer"] + assert isinstance(answer, list) + assert len(answer) == 1 + assert answer[0]["value"] == "sample text" + assert answer[0]["frames"] == [{"start": 9, "end": 15}] + + def test_video_classification_nesting_bbox(): bbox_annotation = [ VideoObjectAnnotation( From 210d84e6528f824582bbdbae4ee627290cf3df8e Mon Sep 17 00:00:00 2001 From: paulnoirel <87332996+paulnoirel@users.noreply.github.com> Date: Thu, 26 Feb 2026 10:36:17 +0000 Subject: [PATCH 2/2] use DataRow in NDVideoText --- .../serialization/ndjson/classification.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py index 536b0bf53..fc519edb2 100644 --- a/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py +++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py @@ -223,9 +223,27 @@ class NDVideoText(BaseModel): {"name": "...", "answer": [{"value": "text", "frames": [{"start": 1, "end": 5}]}], ...} """ - name: str + name: Optional[str] = None + schema_id: Optional[str] = Field(default=None, alias="schemaId") answer: List[NDVideoTextAnswer] - dataRow: Dict[str, str] + data_row: DataRow = Field(alias="dataRow") + + model_config = ConfigDict(populate_by_name=True) + + @model_validator(mode="after") + def must_set_one(self): + if not self.name and not self.schema_id: + raise ValueError("Schema id or name are not set. Set either one.") + return self + + @model_serializer(mode="wrap") + def serialize_model(self, handler): + res = handler(self) + if "name" in res and res["name"] is None: + res.pop("name") + if "schemaId" in res and res["schemaId"] is None: + res.pop("schemaId") + return res @classmethod def from_video_text_group( @@ -235,14 +253,10 @@ def from_video_text_group( data: "GenericDataRowData", ) -> "NDVideoText": first = annotation_group[0] - data_row = {} - if data.global_key: - data_row["globalKey"] = data.global_key - elif data.uid: - data_row["id"] = data.uid return cls( name=first.name, - dataRow=data_row, + schema_id=first.feature_schema_id, + data_row=DataRow(id=data.uid, global_key=data.global_key), answer=[ NDVideoTextAnswer(value=text_val, frames=ranges) for text_val, ranges in frame_ranges_by_text.items()