class NDVideoTextAnswer(BaseModel):
    """One text value of a video free-text classification.

    Attributes:
        value: The free-text answer entered for the segment(s).
        frames: Frame ranges the text applies to, each shaped
            ``{"start": int, "end": int}``.
    """

    value: str
    frames: List[Dict[str, int]]

    model_config = ConfigDict(populate_by_name=True)


class NDVideoText(BaseModel):
    """Video text classification with per-segment text values and frame ranges.

    Produces NDJSON like:
    {"name": "...", "answer": [{"value": "text", "frames": [{"start": 1, "end": 5}]}], ...}
    """

    name: Optional[str] = None
    schema_id: Optional[str] = Field(default=None, alias="schemaId")
    answer: List[NDVideoTextAnswer]
    data_row: DataRow = Field(alias="dataRow")

    model_config = ConfigDict(populate_by_name=True)

    @model_validator(mode="after")
    def must_set_one(self):
        """Require at least one of ``name`` / ``schema_id`` so the feature is identifiable."""
        # FIX: the message literal was broken across a physical line in the
        # source (a syntax error as rendered); rejoined into a single literal.
        if not self.name and not self.schema_id:
            raise ValueError("Schema id or name are not set. Set either one.")
        return self

    @model_serializer(mode="wrap")
    def serialize_model(self, handler):
        """Serialize normally, then drop whichever of name/schemaId is unset.

        The NDJSON import format expects exactly the identifier that was
        provided; emitting a null for the other key would be rejected.
        """
        res = handler(self)
        if "name" in res and res["name"] is None:
            res.pop("name")
        if "schemaId" in res and res["schemaId"] is None:
            res.pop("schemaId")
        return res

    @classmethod
    def from_video_text_group(
        cls,
        annotation_group: List["VideoClassificationAnnotation"],
        frame_ranges_by_text: Dict[str, List[Dict[str, int]]],
        data: "GenericDataRowData",
    ) -> "NDVideoText":
        """Build an NDVideoText from one group of per-frame text annotations.

        Args:
            annotation_group: Annotations sharing a single feature; the first
                entry supplies the identifying ``name`` / ``schema_id``.
            frame_ranges_by_text: Mapping of text value -> list of
                ``{"start", "end"}`` frame ranges for that value.
            data: Data row the annotations belong to (uid / global key).

        Returns:
            An NDVideoText carrying one NDVideoTextAnswer per distinct text.
        """
        first = annotation_group[0]
        return cls(
            name=first.name,
            schema_id=first.feature_schema_id,
            data_row=DataRow(id=data.uid, global_key=data.global_key),
            answer=[
                NDVideoTextAnswer(value=text_val, frames=ranges)
                for text_val, ranges in frame_ranges_by_text.items()
            ],
        )
def test_video_classification_text_produces_ndjson_with_frames():
    """VideoClassificationAnnotation + Text serializes with answer as a list of {value, frames}."""
    # (frame, segment_index, text) triples for the "free_text" feature.
    frame_specs = [
        (9, 0, "Looks like a hungry big cat"),
        (15, 0, "Looks like a hungry big cat"),
        (40, 1, "It's getting closer!"),
        (50, 1, "It's getting closer!"),
    ]
    label = Label(
        data=GenericDataRowData(global_key="sample-video-text"),
        annotations=[
            VideoClassificationAnnotation(
                name="free_text",
                frame=frame,
                segment_index=segment,
                value=Text(answer=text),
            )
            for frame, segment, text in frame_specs
        ],
    )

    serialized = list(NDJsonConverter.serialize([label]))
    free_text_rows = [r for r in serialized if r.get("name") == "free_text"]
    assert len(free_text_rows) == 1

    row = free_text_rows[0]
    assert row["dataRow"] == {"globalKey": "sample-video-text"}
    assert "answer" in row
    answer = row["answer"]
    assert isinstance(answer, list)
    assert len(answer) == 2

    # Distinct texts each get their own {value, frames} entry.
    by_value = {entry["value"]: entry for entry in answer}
    expected_frames = {
        "Looks like a hungry big cat": [{"start": 9, "end": 15}],
        "It's getting closer!": [{"start": 40, "end": 50}],
    }
    for text_value, frames in expected_frames.items():
        assert text_value in by_value
        assert by_value[text_value]["frames"] == frames


def test_video_classification_text_single_text_across_frames():
    """VideoClassificationAnnotation + Text with same text across all frames."""
    annotations = [
        VideoClassificationAnnotation(
            name="free_text_per_frame",
            frame=frame,
            segment_index=0,
            value=Text(answer="sample text"),
        )
        for frame in (9, 15)
    ]
    label = Label(
        data=GenericDataRowData(global_key="sample-video-single-text"),
        annotations=annotations,
    )

    serialized = list(NDJsonConverter.serialize([label]))
    free_text_rows = [
        r for r in serialized if r.get("name") == "free_text_per_frame"
    ]
    assert len(free_text_rows) == 1

    row = free_text_rows[0]
    assert row["dataRow"] == {"globalKey": "sample-video-single-text"}
    answer = row["answer"]
    assert isinstance(answer, list)
    assert len(answer) == 1
    # A single repeated text collapses into one entry spanning the segment.
    assert answer[0]["value"] == "sample text"
    assert answer[0]["frames"] == [{"start": 9, "end": 15}]