# Categories whose queries are issued from an image or video source and that
# were expanded by this change. The benchmark-definition tests count the
# entries of ALL_GROUND_TRUTH per category in this set.
WEAK_CROSS_MODAL_BENCHMARK_CATEGORIES = {
    "image_to_text",
    "image_to_document",
    "video_to_text",
    "video_to_image",
    "video_to_document",
}

# Per-category query count that the benchmark-definition tests compare against.
MIN_WEAK_CATEGORY_QUERIES = 20


def _media_query_variants(
    *,
    category: str,
    query_type: str,
    source_path: str,
    relevant_paths: List[str],
    queries: List[str],
    difficulty: str,
    graded_relevance: Optional[Dict[str, int]] = None,
) -> List[GroundTruth]:
    """Build one ``GroundTruth`` per query string for a single media source.

    All variants share the same category, difficulty, source media path and
    relevance labels; only the ``query`` text differs between them.

    Args:
        category: Benchmark category stored on every variant.
        query_type: ``"image"`` or ``"video"``. Selects whether ``source_path``
            is stored as ``image_query_path`` or ``video_query_path``.
        source_path: Path of the image/video the query originates from.
        relevant_paths: Corpus paths considered relevant for every variant.
        queries: Query strings; one ``GroundTruth`` is produced per entry,
            in the same order.
        difficulty: Difficulty label stored on every variant.
        graded_relevance: Optional mapping of corpus path to relevance grade.
            ``None`` is forwarded unchanged.

    Returns:
        A list of ``GroundTruth`` objects, one per entry of ``queries``.

    Raises:
        ValueError: If ``query_type`` is neither ``"image"`` nor ``"video"``.
            The check runs before any variant is built, so it also fires
            when ``queries`` is empty.
    """
    source_field_by_type = {
        "image": "image_query_path",
        "video": "video_query_path",
    }
    if query_type not in source_field_by_type:
        raise ValueError(f"Unsupported media query_type: {query_type}")
    source_field = source_field_by_type[query_type]

    variants: List[GroundTruth] = []
    for query in queries:
        kwargs: Dict[str, Any] = {
            "query": query,
            "query_type": query_type,
            # Each variant gets its own containers so that mutating one
            # variant's labels can never leak into its siblings or back into
            # the caller's arguments.
            "relevant_paths": list(relevant_paths),
            "category": category,
            "difficulty": difficulty,
            # NOTE(review): None is passed through explicitly, as before.
            # Confirm GroundTruth tolerates graded_relevance=None, because
            # get_relevance_score calls self.graded_relevance.get(...).
            "graded_relevance": (
                dict(graded_relevance) if graded_relevance is not None else None
            ),
            source_field: source_path,
        }
        variants.append(GroundTruth(**kwargs))
    return variants
get_relevance_score(self, path: str) -> int: ] # ============================================================================ -# IMAGE → TEXT QUERIES (15 queries) +# IMAGE → TEXT QUERIES (20 queries) # ============================================================================ IMAGE_TO_TEXT = [ @@ -754,6 +796,52 @@ def get_relevance_score(self, path: str) -> int: ), ] +IMAGE_TO_TEXT += _media_query_variants( + category="image_to_text", + query_type="image", + source_path="images/neural_network_diagram.png", + relevant_paths=["text/ai_transformers.md", "text/ai_embeddings.md", "text/science_neuroscience.md"], + queries=["neural network diagram connected to transformer and embedding notes"], + difficulty="medium", + graded_relevance={"text/ai_transformers.md": 2, "text/ai_embeddings.md": 2, "text/science_neuroscience.md": 1}, +) +IMAGE_TO_TEXT += _media_query_variants( + category="image_to_text", + query_type="image", + source_path="images/code_editor_screenshot.png", + relevant_paths=["text/tech_edge_ai.md", "text/tech_cybersecurity.md", "text/tech_cloud_computing.md"], + queries=["code editor screenshot linked to edge AI and cloud engineering notes"], + difficulty="medium", + graded_relevance={"text/tech_edge_ai.md": 2, "text/tech_cloud_computing.md": 1, "text/tech_cybersecurity.md": 1}, +) +IMAGE_TO_TEXT += _media_query_variants( + category="image_to_text", + query_type="image", + source_path="images/handwritten_notes.png", + relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", "text/finance_investing.md"], + queries=["handwritten planning notes connected to agent roadmap and budget context"], + difficulty="hard", + graded_relevance={"text/ai_agents.md": 2, "text/tech_cloud_computing.md": 1, "text/finance_investing.md": 1}, +) +IMAGE_TO_TEXT += _media_query_variants( + category="image_to_text", + query_type="image", + source_path="images/whiteboard_brainstorm.png", + relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", 
"text/tech_edge_ai.md"], + queries=["brainstorm whiteboard should retrieve agent architecture notes"], + difficulty="hard", + graded_relevance={"text/ai_agents.md": 2, "text/tech_cloud_computing.md": 1, "text/tech_edge_ai.md": 1}, +) +IMAGE_TO_TEXT += _media_query_variants( + category="image_to_text", + query_type="image", + source_path="images/floor_plan_blueprint.png", + relevant_paths=["text/architecture_blueprints.md", "text/architecture_modern.md", "text/architecture_gothic.md"], + queries=["floor plan image tied to architecture blueprint reference notes"], + difficulty="medium", + graded_relevance={"text/architecture_blueprints.md": 2, "text/architecture_modern.md": 1, "text/architecture_gothic.md": 1}, +) + # ============================================================================ # MIXED MODAL QUERIES (20 queries) - find BOTH text and image # ============================================================================ @@ -1253,6 +1341,85 @@ def get_relevance_score(self, path: str) -> int: ), ] +IMAGE_TO_DOCUMENT += _media_query_variants( + category="image_to_document", + query_type="image", + source_path="images/neural_network_diagram.png", + relevant_paths=["documents/ai_strategy_report.docx", "documents/ai_architecture_deck.pptx", "documents/embedding_research.pdf"], + queries=[ + "neural network diagram should recall AI strategy document", + "embedding architecture image linked to research paper and slides", + "visual AI model graph connected to cross-modal embedding documents", + ], + difficulty="hard", + graded_relevance={"documents/ai_strategy_report.docx": 2, "documents/ai_architecture_deck.pptx": 2, "documents/embedding_research.pdf": 2}, +) +IMAGE_TO_DOCUMENT += _media_query_variants( + category="image_to_document", + query_type="image", + source_path="images/whiteboard_architecture.png", + relevant_paths=["documents/recallforge_spec.docx", "documents/ai_architecture_deck.pptx", "documents/ai_strategy_report.docx"], + queries=[ + "whiteboard 
architecture sketch should find RecallForge specification", + "system design drawing linked to architecture presentation", + "AI service diagram should retrieve strategy roadmap docs", + ], + difficulty="hard", + graded_relevance={"documents/recallforge_spec.docx": 2, "documents/ai_architecture_deck.pptx": 2, "documents/ai_strategy_report.docx": 1}, +) +IMAGE_TO_DOCUMENT += _media_query_variants( + category="image_to_document", + query_type="image", + source_path="images/whiteboard_brainstorm.png", + relevant_paths=["documents/ai_strategy_report.docx", "documents/project_status_q1.docx", "documents/quarterly_review.pptx"], + queries=[ + "brainstorm board linked to AI roadmap and quarterly review", + "planning whiteboard should recall project status documentation", + "strategy workshop image connected to portfolio review slides", + ], + difficulty="hard", + graded_relevance={"documents/ai_strategy_report.docx": 2, "documents/project_status_q1.docx": 2, "documents/quarterly_review.pptx": 1}, +) +IMAGE_TO_DOCUMENT += _media_query_variants( + category="image_to_document", + query_type="image", + source_path="images/code_editor_screenshot.png", + relevant_paths=["documents/recallforge_spec.docx", "documents/operations_manual.pdf", "documents/edge_deployment_guide.pdf"], + queries=[ + "developer screenshot should retrieve RecallForge technical specification", + "code editor image linked to install and operations manual", + "local deployment screen should find edge deployment guide", + ], + difficulty="hard", + graded_relevance={"documents/recallforge_spec.docx": 2, "documents/operations_manual.pdf": 2, "documents/edge_deployment_guide.pdf": 1}, +) +IMAGE_TO_DOCUMENT += _media_query_variants( + category="image_to_document", + query_type="image", + source_path="images/floor_plan_blueprint.png", + relevant_paths=["documents/ai_architecture_deck.pptx", "documents/operations_manual.pdf", "documents/project_status_q1.docx"], + queries=[ + "blueprint image connected to 
architecture presentation", + "floor plan drawing should recall operations and planning docs", + "building layout visual tied to project review documentation", + ], + difficulty="hard", + graded_relevance={"documents/ai_architecture_deck.pptx": 2, "documents/operations_manual.pdf": 1, "documents/project_status_q1.docx": 1}, +) +IMAGE_TO_DOCUMENT += _media_query_variants( + category="image_to_document", + query_type="image", + source_path="images/handwritten_notes.png", + relevant_paths=["documents/project_status_q1.docx", "documents/quarterly_review.pptx", "documents/ai_strategy_report.docx"], + queries=[ + "handwritten meeting notes should find project status report", + "notebook planning image linked to quarterly review deck", + "planning notes connected to AI strategy report", + ], + difficulty="hard", + graded_relevance={"documents/project_status_q1.docx": 2, "documents/quarterly_review.pptx": 2, "documents/ai_strategy_report.docx": 1}, +) + # ── Video → Text queries ────────────────────────────────────── VIDEO_TO_TEXT = [ GroundTruth( @@ -1281,6 +1448,74 @@ def get_relevance_score(self, path: str) -> int: ), ] +VIDEO_TO_TEXT += _media_query_variants( + category="video_to_text", + query_type="video", + source_path="videos/nature_timelapse.mp4", + relevant_paths=["text/nature_forests.md", "text/nature_mountains.md", "text/nature_oceans.md", "text/science_climate.md", "text/travel_national_parks.md"], + queries=[ + "nature timelapse should recall forest mountain and ocean notes", + "outdoor landscape video connected to climate and park memories", + "scenic video with forests mountains and water reference text", + "environment footage linked to nature and national parks docs", + ], + difficulty="medium", + graded_relevance={"text/nature_forests.md": 2, "text/nature_mountains.md": 2, "text/nature_oceans.md": 2, "text/science_climate.md": 1, "text/travel_national_parks.md": 1}, +) +VIDEO_TO_TEXT += _media_query_variants( + category="video_to_text", + 
query_type="video", + source_path="videos/coding_demo.mp4", + relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", "text/tech_edge_ai.md", "text/tech_cybersecurity.md"], + queries=[ + "coding demo should retrieve software agents and cloud notes", + "developer video connected to backend architecture text memories", + "programming walkthrough linked to edge AI implementation notes", + "software tutorial video should recall cybersecurity and cloud docs", + ], + difficulty="hard", + graded_relevance={"text/ai_agents.md": 2, "text/tech_cloud_computing.md": 2, "text/tech_edge_ai.md": 1, "text/tech_cybersecurity.md": 1}, +) +VIDEO_TO_TEXT += _media_query_variants( + category="video_to_text", + query_type="video", + source_path="videos/architecture_walkthrough.mp4", + relevant_paths=["text/architecture_blueprints.md", "text/architecture_modern.md", "text/architecture_gothic.md", "text/tech_cloud_computing.md"], + queries=[ + "building walkthrough video connected to architecture reference notes", + "architecture tour should recall blueprint and modern design text", + "walkthrough recording linked to systems architecture notes", + ], + difficulty="medium", + graded_relevance={"text/architecture_blueprints.md": 2, "text/architecture_modern.md": 2, "text/architecture_gothic.md": 1, "text/tech_cloud_computing.md": 1}, +) +VIDEO_TO_TEXT += _media_query_variants( + category="video_to_text", + query_type="video", + source_path="videos/cooking_tutorial.mp4", + relevant_paths=["text/cooking_pasta.md", "text/cooking_grilling.md", "text/cooking_spices.md", "text/cooking_asian_cuisine.md"], + queries=[ + "cooking tutorial video should find recipe and spice notes", + "food preparation recording connected to pasta cooking memory", + "culinary demonstration linked to cooking technique documents", + ], + difficulty="medium", + graded_relevance={"text/cooking_pasta.md": 2, "text/cooking_spices.md": 1, "text/cooking_asian_cuisine.md": 1, "text/cooking_grilling.md": 1}, +) 
+VIDEO_TO_TEXT += _media_query_variants( + category="video_to_text", + query_type="video", + source_path="videos/whiteboard_session.mp4", + relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", "text/tech_edge_ai.md", "text/ai_embeddings.md"], + queries=[ + "whiteboard session video should retrieve AI agent design notes", + "planning meeting recording linked to cloud architecture memory", + "brainstorming video connected to embedding and edge AI notes", + ], + difficulty="hard", + graded_relevance={"text/ai_agents.md": 2, "text/tech_cloud_computing.md": 2, "text/tech_edge_ai.md": 1, "text/ai_embeddings.md": 1}, +) + # ── Video → Image queries ───────────────────────────────────── VIDEO_TO_IMAGE = [ GroundTruth( @@ -1301,6 +1536,74 @@ def get_relevance_score(self, path: str) -> int: ), ] +VIDEO_TO_IMAGE += _media_query_variants( + category="video_to_image", + query_type="video", + source_path="videos/nature_timelapse.mp4", + relevant_paths=["images/forest_landscape.png", "images/mountain_landscape.png", "images/ocean_beach.png"], + queries=[ + "nature video should match forest and mountain images", + "timelapse landscape recording connected to outdoor photos", + "scenic footage should recall ocean beach and mountain visuals", + "environment video linked to landscape image memories", + ], + difficulty="medium", + graded_relevance={"images/forest_landscape.png": 2, "images/mountain_landscape.png": 2, "images/ocean_beach.png": 2}, +) +VIDEO_TO_IMAGE += _media_query_variants( + category="video_to_image", + query_type="video", + source_path="videos/whiteboard_session.mp4", + relevant_paths=["images/whiteboard_brainstorm.png", "images/whiteboard_architecture.png", "images/handwritten_notes.png"], + queries=[ + "whiteboard meeting video should match brainstorm images", + "planning session recording connected to whiteboard photos", + "team board video should recall architecture sketch image", + "meeting notes video linked to handwritten planning image", + ], 
+ difficulty="hard", + graded_relevance={"images/whiteboard_brainstorm.png": 2, "images/whiteboard_architecture.png": 2, "images/handwritten_notes.png": 1}, +) +VIDEO_TO_IMAGE += _media_query_variants( + category="video_to_image", + query_type="video", + source_path="videos/architecture_walkthrough.mp4", + relevant_paths=["images/floor_plan_blueprint.png", "images/whiteboard_architecture.png", "images/neural_network_diagram.png"], + queries=[ + "architecture walkthrough should match blueprint and system diagrams", + "building tour video connected to floor plan visual memory", + "architecture recording linked to whiteboard design image", + ], + difficulty="hard", + graded_relevance={"images/floor_plan_blueprint.png": 2, "images/whiteboard_architecture.png": 1, "images/neural_network_diagram.png": 1}, +) +VIDEO_TO_IMAGE += _media_query_variants( + category="video_to_image", + query_type="video", + source_path="videos/coding_demo.mp4", + relevant_paths=["images/code_editor_screenshot.png", "images/neural_network_diagram.png", "images/whiteboard_architecture.png"], + queries=[ + "coding demo should match code editor screenshot", + "software video connected to neural network diagram image", + "developer recording linked to architecture whiteboard visual", + ], + difficulty="hard", + graded_relevance={"images/code_editor_screenshot.png": 2, "images/neural_network_diagram.png": 1, "images/whiteboard_architecture.png": 1}, +) +VIDEO_TO_IMAGE += _media_query_variants( + category="video_to_image", + query_type="video", + source_path="videos/cooking_tutorial.mp4", + relevant_paths=["images/food_pasta_dish.png"], + queries=[ + "cooking tutorial should match pasta dish image", + "recipe video connected to plated food photo", + "culinary demonstration linked to food image memory", + "kitchen video should recall pasta visual", + ], + difficulty="medium", +) + # ── Video → Video queries (similarity) ──────────────────────── VIDEO_TO_VIDEO = [ GroundTruth( @@ -1326,6 +1629,56 @@ 
def get_relevance_score(self, path: str) -> int: ), ] +VIDEO_TO_DOCUMENT += _media_query_variants( + category="video_to_document", + query_type="video", + source_path="videos/architecture_walkthrough.mp4", + relevant_paths=["documents/ai_architecture_deck.pptx", "documents/project_status_q1.docx", "documents/ai_strategy_report.docx"], + queries=[ + "architecture walkthrough should retrieve architecture deck", + "building tour video connected to planning and status docs", + "walkthrough recording linked to AI strategy architecture slides", + "architecture video should recall project milestone deck", + "tour footage connected to enterprise AI roadmap documents", + "system walkthrough recording linked to architecture presentation", + "architecture recording tied to portfolio planning documents", + ], + difficulty="hard", + graded_relevance={"documents/ai_architecture_deck.pptx": 2, "documents/project_status_q1.docx": 1, "documents/ai_strategy_report.docx": 1}, +) +VIDEO_TO_DOCUMENT += _media_query_variants( + category="video_to_document", + query_type="video", + source_path="videos/coding_demo.mp4", + relevant_paths=["documents/recallforge_spec.docx", "documents/operations_manual.pdf", "documents/edge_deployment_guide.pdf", "documents/embedding_research.pdf"], + queries=[ + "coding demo should find RecallForge technical specification", + "developer video connected to operations manual", + "software walkthrough linked to edge deployment guide", + "code demonstration should recall embedding research paper", + "implementation video tied to install and configuration docs", + "developer recording connected to local multimodal search spec", + ], + difficulty="hard", + graded_relevance={"documents/recallforge_spec.docx": 2, "documents/operations_manual.pdf": 2, "documents/edge_deployment_guide.pdf": 1, "documents/embedding_research.pdf": 1}, +) +VIDEO_TO_DOCUMENT += _media_query_variants( + category="video_to_document", + query_type="video", + 
source_path="videos/whiteboard_session.mp4", + relevant_paths=["documents/ai_strategy_report.docx", "documents/project_status_q1.docx", "documents/quarterly_review.pptx", "documents/recallforge_spec.docx"], + queries=[ + "whiteboard session should retrieve strategy roadmap documents", + "planning video connected to quarterly review presentation", + "brainstorm recording linked to project status report", + "whiteboard meeting should recall RecallForge spec", + "roadmap discussion video tied to portfolio review deck", + "team planning recording connected to AI strategy docs", + ], + difficulty="hard", + graded_relevance={"documents/ai_strategy_report.docx": 2, "documents/project_status_q1.docx": 2, "documents/quarterly_review.pptx": 1, "documents/recallforge_spec.docx": 1}, +) + ALL_GROUND_TRUTH = ( TEXT_TO_TEXT + TEXT_TO_IMAGE + diff --git a/tests/test_cross_modal_benchmark_defs.py b/tests/test_cross_modal_benchmark_defs.py index a660360..ec1cbdf 100644 --- a/tests/test_cross_modal_benchmark_defs.py +++ b/tests/test_cross_modal_benchmark_defs.py @@ -68,6 +68,43 @@ def test_text_to_document_block_matches_category(self): all(gt.category == "text_to_document" for gt in module.TEXT_TO_DOCUMENT) ) + def test_weak_cross_modal_categories_have_query_floor(self): + module = _load_cross_modal_ablation() + + counts = { + category: sum(1 for gt in module.ALL_GROUND_TRUTH if gt.category == category) + for category in module.WEAK_CROSS_MODAL_BENCHMARK_CATEGORIES + } + + self.assertEqual( + counts, + { + "image_to_document": module.MIN_WEAK_CATEGORY_QUERIES, + "image_to_text": module.MIN_WEAK_CATEGORY_QUERIES, + "video_to_document": module.MIN_WEAK_CATEGORY_QUERIES, + "video_to_image": module.MIN_WEAK_CATEGORY_QUERIES, + "video_to_text": module.MIN_WEAK_CATEGORY_QUERIES, + }, + ) + + def test_expanded_media_queries_keep_source_path_provenance(self): + module = _load_cross_modal_ablation() + + weak_queries = [ + gt + for gt in module.ALL_GROUND_TRUTH + if gt.category in 
module.WEAK_CROSS_MODAL_BENCHMARK_CATEGORIES + ] + + self.assertTrue(weak_queries) + for gt in weak_queries: + if gt.query_type == "image": + self.assertTrue(gt.image_query_path) + elif gt.query_type == "video": + self.assertTrue(gt.video_query_path) + else: + self.fail(f"Unexpected query_type for weak media category: {gt.query_type}") + def test_bm25_skip_helper_skips_image_and_video_query_categories(self): module = _load_cross_modal_ablation() diff --git a/tests/uat/corpus/CORPUS_EXPANSION.md b/tests/uat/corpus/CORPUS_EXPANSION.md index 0e93fb1..196a688 100644 --- a/tests/uat/corpus/CORPUS_EXPANSION.md +++ b/tests/uat/corpus/CORPUS_EXPANSION.md @@ -10,7 +10,7 @@ This document describes the expanded RecallForge benchmark corpus and what addit - **Documents**: 8 generated `.docx` / `.pptx` / `.pdf` files - **Total corpus documents**: 82 registered in `CORPUS_DOCS` - **Total indexed benchmark items**: 77 searchable items (the transcript JSON placeholders are empty and not indexed) -- **Total benchmark queries**: 154 queries across all modalities +- **Total benchmark queries**: 231 queries across all modalities ## New Text Documents Added (39 files) @@ -139,18 +139,28 @@ To further expand the corpus for more comprehensive cross-modal testing, the fol |----------|-------|------|--------|------|-------------------| | text_to_text | 60 | 26 | 24 | 10 | 5 | | text_to_image | 18 | 8 | 8 | 2 | 0 | -| image_to_text | 15 | 6 | 6 | 3 | 0 | +| image_to_text | 20 | 6 | 9 | 5 | 0 | | image_to_image | 3 | 1 | 2 | 0 | 1 | | image_to_video | 2 | 0 | 0 | 2 | 0 | -| image_to_document | 2 | 0 | 0 | 2 | 0 | -| video_to_text | 3 | 0 | 2 | 1 | 0 | -| video_to_image | 2 | 0 | 1 | 1 | 0 | +| image_to_document | 20 | 0 | 0 | 20 | 0 | +| video_to_text | 20 | 0 | 12 | 8 | 0 | +| video_to_image | 20 | 0 | 9 | 11 | 0 | | video_to_video | 1 | 1 | 0 | 0 | 1 | -| video_to_document | 1 | 0 | 0 | 1 | 0 | +| video_to_document | 20 | 0 | 0 | 20 | 0 | | mixed_modal | 20 | 8 | 8 | 4 | 0 | | 
text_to_video | 15 | 6 | 6 | 3 | 0 | | text_to_document | 12 | 6 | 5 | 1 | 0 | -| **Total** | **154** | **62** | **62** | **30** | **7** | +| **Total** | **231** | **62** | **83** | **86** | **7** | + +REC-160 expanded the weakest media-query categories to 20 queries each: + +- `image_to_text` +- `image_to_document` +- `video_to_text` +- `video_to_image` +- `video_to_document` + +Those added queries reuse the existing UAT media corpus with more grounded intent labels and explicit source-path provenance. The corpus should still grow real additional images/videos before treating these as a broad public benchmark. ## New Metrics Added @@ -158,6 +168,7 @@ To further expand the corpus for more comprehensive cross-modal testing, the fol 2. **Graded Relevance** - Supports 0/1/2 scores (irrelevant/partial/full) 3. **Query Difficulty Annotation** - Each query tagged as easy/medium/hard 4. **Per-Difficulty Metrics** - Recall reported separately for each difficulty tier +5. **Parent-Memory vs Asset-Level Metrics** - Child assets such as video frames, transcripts, document sections, and OCR pages can credit their canonical parent memory while raw asset metrics remain visible for diagnosis ## Hard Negatives