From 32ba244e999d2d418383deeae8cbfa2581596ecd Mon Sep 17 00:00:00 2001
From: MollyAI <molly@openclaw.ai>
Date: Sun, 17 May 2026 14:08:11 -0400
Subject: [PATCH] Expand weak cross-modal benchmark categories

---
 benchmarks/cross_modal_ablation.py       | 355 ++++++++++++++++++++++-
 tests/test_cross_modal_benchmark_defs.py |  37 +++
 tests/uat/corpus/CORPUS_EXPANSION.md     |  25 +-
 3 files changed, 409 insertions(+), 8 deletions(-)

diff --git a/benchmarks/cross_modal_ablation.py b/benchmarks/cross_modal_ablation.py
index 8298b5f..572e4eb 100644
--- a/benchmarks/cross_modal_ablation.py
+++ b/benchmarks/cross_modal_ablation.py
@@ -169,6 +169,48 @@ def get_relevance_score(self, path: str) -> int:
         return self.graded_relevance.get(path, 0)
 
 
+WEAK_CROSS_MODAL_BENCHMARK_CATEGORIES = {  # image/video-query categories whose query counts the definitions test checks
+    "image_to_text",
+    "image_to_document",
+    "video_to_text",
+    "video_to_image",
+    "video_to_document",
+}
+
+MIN_WEAK_CATEGORY_QUERIES = 20  # per-category query count the definitions test compares each weak category against
+
+
+def _media_query_variants(
+    *,
+    category: str,
+    query_type: str,
+    source_path: str,
+    relevant_paths: List[str],
+    queries: List[str],
+    difficulty: str,
+    graded_relevance: Optional[Dict[str, int]] = None,
+) -> List[GroundTruth]:
+    """Build one GroundTruth per query string for a single image/video source.
+
+    Raises ValueError for an unsupported ``query_type`` (even if ``queries`` is
+    empty). Each variant gets its own copy of ``relevant_paths`` and
+    ``graded_relevance`` so mutating one entry cannot leak into its siblings.
+    """
+    path_field = {"image": "image_query_path", "video": "video_query_path"}.get(query_type)
+    if path_field is None:
+        raise ValueError(f"Unsupported media query_type: {query_type}")
+    variants: List[GroundTruth] = []
+    for query in queries:
+        kwargs: Dict[str, Any] = {
+            "query": query, "query_type": query_type, "category": category,
+            "relevant_paths": list(relevant_paths), "difficulty": difficulty,
+            "graded_relevance": None if graded_relevance is None else dict(graded_relevance),
+            path_field: source_path,  # image_query_path or video_query_path
+        }
+        variants.append(GroundTruth(**kwargs))
+    return variants
+
+
 # ============================================================================
 # TEXT → TEXT QUERIES (50 queries)
 # ============================================================================
@@ -613,7 +655,7 @@ def get_relevance_score(self, path: str) -> int:
 ]
 
 # ============================================================================
-# IMAGE → TEXT QUERIES (15 queries)
+# IMAGE → TEXT QUERIES (20 queries)
 # ============================================================================
 
 IMAGE_TO_TEXT = [
@@ -754,6 +796,52 @@ def get_relevance_score(self, path: str) -> int:
     ),
 ]
 
+IMAGE_TO_TEXT += _media_query_variants(  # 1 query variant: neural network diagram image
+    category="image_to_text",
+    query_type="image",
+    source_path="images/neural_network_diagram.png",
+    relevant_paths=["text/ai_transformers.md", "text/ai_embeddings.md", "text/science_neuroscience.md"],
+    queries=["neural network diagram connected to transformer and embedding notes"],
+    difficulty="medium",
+    graded_relevance={"text/ai_transformers.md": 2, "text/ai_embeddings.md": 2, "text/science_neuroscience.md": 1},
+)
+IMAGE_TO_TEXT += _media_query_variants(  # 1 query variant: code editor screenshot
+    category="image_to_text",
+    query_type="image",
+    source_path="images/code_editor_screenshot.png",
+    relevant_paths=["text/tech_edge_ai.md", "text/tech_cybersecurity.md", "text/tech_cloud_computing.md"],
+    queries=["code editor screenshot linked to edge AI and cloud engineering notes"],
+    difficulty="medium",
+    graded_relevance={"text/tech_edge_ai.md": 2, "text/tech_cloud_computing.md": 1, "text/tech_cybersecurity.md": 1},
+)
+IMAGE_TO_TEXT += _media_query_variants(  # 1 query variant: handwritten notes image
+    category="image_to_text",
+    query_type="image",
+    source_path="images/handwritten_notes.png",
+    relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", "text/finance_investing.md"],
+    queries=["handwritten planning notes connected to agent roadmap and budget context"],
+    difficulty="hard",
+    graded_relevance={"text/ai_agents.md": 2, "text/tech_cloud_computing.md": 1, "text/finance_investing.md": 1},
+)
+IMAGE_TO_TEXT += _media_query_variants(  # 1 query variant: brainstorm whiteboard image
+    category="image_to_text",
+    query_type="image",
+    source_path="images/whiteboard_brainstorm.png",
+    relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", "text/tech_edge_ai.md"],
+    queries=["brainstorm whiteboard should retrieve agent architecture notes"],
+    difficulty="hard",
+    graded_relevance={"text/ai_agents.md": 2, "text/tech_cloud_computing.md": 1, "text/tech_edge_ai.md": 1},
+)
+IMAGE_TO_TEXT += _media_query_variants(  # 1 query variant: floor plan blueprint image
+    category="image_to_text",
+    query_type="image",
+    source_path="images/floor_plan_blueprint.png",
+    relevant_paths=["text/architecture_blueprints.md", "text/architecture_modern.md", "text/architecture_gothic.md"],
+    queries=["floor plan image tied to architecture blueprint reference notes"],
+    difficulty="medium",
+    graded_relevance={"text/architecture_blueprints.md": 2, "text/architecture_modern.md": 1, "text/architecture_gothic.md": 1},
+)
+
 # ============================================================================
 # MIXED MODAL QUERIES (20 queries) - find BOTH text and image
 # ============================================================================
@@ -1253,6 +1341,85 @@ def get_relevance_score(self, path: str) -> int:
     ),
 ]
 
+IMAGE_TO_DOCUMENT += _media_query_variants(  # 3 query variants: neural network diagram image
+    category="image_to_document",
+    query_type="image",
+    source_path="images/neural_network_diagram.png",
+    relevant_paths=["documents/ai_strategy_report.docx", "documents/ai_architecture_deck.pptx", "documents/embedding_research.pdf"],
+    queries=[
+        "neural network diagram should recall AI strategy document",
+        "embedding architecture image linked to research paper and slides",
+        "visual AI model graph connected to cross-modal embedding documents",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/ai_strategy_report.docx": 2, "documents/ai_architecture_deck.pptx": 2, "documents/embedding_research.pdf": 2},
+)
+IMAGE_TO_DOCUMENT += _media_query_variants(  # 3 query variants: architecture whiteboard image
+    category="image_to_document",
+    query_type="image",
+    source_path="images/whiteboard_architecture.png",
+    relevant_paths=["documents/recallforge_spec.docx", "documents/ai_architecture_deck.pptx", "documents/ai_strategy_report.docx"],
+    queries=[
+        "whiteboard architecture sketch should find RecallForge specification",
+        "system design drawing linked to architecture presentation",
+        "AI service diagram should retrieve strategy roadmap docs",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/recallforge_spec.docx": 2, "documents/ai_architecture_deck.pptx": 2, "documents/ai_strategy_report.docx": 1},
+)
+IMAGE_TO_DOCUMENT += _media_query_variants(  # 3 query variants: brainstorm whiteboard image
+    category="image_to_document",
+    query_type="image",
+    source_path="images/whiteboard_brainstorm.png",
+    relevant_paths=["documents/ai_strategy_report.docx", "documents/project_status_q1.docx", "documents/quarterly_review.pptx"],
+    queries=[
+        "brainstorm board linked to AI roadmap and quarterly review",
+        "planning whiteboard should recall project status documentation",
+        "strategy workshop image connected to portfolio review slides",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/ai_strategy_report.docx": 2, "documents/project_status_q1.docx": 2, "documents/quarterly_review.pptx": 1},
+)
+IMAGE_TO_DOCUMENT += _media_query_variants(  # 3 query variants: code editor screenshot
+    category="image_to_document",
+    query_type="image",
+    source_path="images/code_editor_screenshot.png",
+    relevant_paths=["documents/recallforge_spec.docx", "documents/operations_manual.pdf", "documents/edge_deployment_guide.pdf"],
+    queries=[
+        "developer screenshot should retrieve RecallForge technical specification",
+        "code editor image linked to install and operations manual",
+        "local deployment screen should find edge deployment guide",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/recallforge_spec.docx": 2, "documents/operations_manual.pdf": 2, "documents/edge_deployment_guide.pdf": 1},
+)
+IMAGE_TO_DOCUMENT += _media_query_variants(  # 3 query variants: floor plan blueprint image
+    category="image_to_document",
+    query_type="image",
+    source_path="images/floor_plan_blueprint.png",
+    relevant_paths=["documents/ai_architecture_deck.pptx", "documents/operations_manual.pdf", "documents/project_status_q1.docx"],
+    queries=[
+        "blueprint image connected to architecture presentation",
+        "floor plan drawing should recall operations and planning docs",
+        "building layout visual tied to project review documentation",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/ai_architecture_deck.pptx": 2, "documents/operations_manual.pdf": 1, "documents/project_status_q1.docx": 1},
+)
+IMAGE_TO_DOCUMENT += _media_query_variants(  # 3 query variants: handwritten notes image
+    category="image_to_document",
+    query_type="image",
+    source_path="images/handwritten_notes.png",
+    relevant_paths=["documents/project_status_q1.docx", "documents/quarterly_review.pptx", "documents/ai_strategy_report.docx"],
+    queries=[
+        "handwritten meeting notes should find project status report",
+        "notebook planning image linked to quarterly review deck",
+        "planning notes connected to AI strategy report",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/project_status_q1.docx": 2, "documents/quarterly_review.pptx": 2, "documents/ai_strategy_report.docx": 1},
+)
+
 # ── Video → Text queries ──────────────────────────────────────
 VIDEO_TO_TEXT = [
     GroundTruth(
@@ -1281,6 +1448,74 @@ def get_relevance_score(self, path: str) -> int:
     ),
 ]
 
+VIDEO_TO_TEXT += _media_query_variants(  # 4 query variants: nature timelapse video
+    category="video_to_text",
+    query_type="video",
+    source_path="videos/nature_timelapse.mp4",
+    relevant_paths=["text/nature_forests.md", "text/nature_mountains.md", "text/nature_oceans.md", "text/science_climate.md", "text/travel_national_parks.md"],
+    queries=[
+        "nature timelapse should recall forest mountain and ocean notes",
+        "outdoor landscape video connected to climate and park memories",
+        "scenic video with forests mountains and water reference text",
+        "environment footage linked to nature and national parks docs",
+    ],
+    difficulty="medium",
+    graded_relevance={"text/nature_forests.md": 2, "text/nature_mountains.md": 2, "text/nature_oceans.md": 2, "text/science_climate.md": 1, "text/travel_national_parks.md": 1},
+)
+VIDEO_TO_TEXT += _media_query_variants(  # 4 query variants: coding demo video
+    category="video_to_text",
+    query_type="video",
+    source_path="videos/coding_demo.mp4",
+    relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", "text/tech_edge_ai.md", "text/tech_cybersecurity.md"],
+    queries=[
+        "coding demo should retrieve software agents and cloud notes",
+        "developer video connected to backend architecture text memories",
+        "programming walkthrough linked to edge AI implementation notes",
+        "software tutorial video should recall cybersecurity and cloud docs",
+    ],
+    difficulty="hard",
+    graded_relevance={"text/ai_agents.md": 2, "text/tech_cloud_computing.md": 2, "text/tech_edge_ai.md": 1, "text/tech_cybersecurity.md": 1},
+)
+VIDEO_TO_TEXT += _media_query_variants(  # 3 query variants: architecture walkthrough video
+    category="video_to_text",
+    query_type="video",
+    source_path="videos/architecture_walkthrough.mp4",
+    relevant_paths=["text/architecture_blueprints.md", "text/architecture_modern.md", "text/architecture_gothic.md", "text/tech_cloud_computing.md"],
+    queries=[
+        "building walkthrough video connected to architecture reference notes",
+        "architecture tour should recall blueprint and modern design text",
+        "walkthrough recording linked to systems architecture notes",
+    ],
+    difficulty="medium",
+    graded_relevance={"text/architecture_blueprints.md": 2, "text/architecture_modern.md": 2, "text/architecture_gothic.md": 1, "text/tech_cloud_computing.md": 1},
+)
+VIDEO_TO_TEXT += _media_query_variants(  # 3 query variants: cooking tutorial video
+    category="video_to_text",
+    query_type="video",
+    source_path="videos/cooking_tutorial.mp4",
+    relevant_paths=["text/cooking_pasta.md", "text/cooking_grilling.md", "text/cooking_spices.md", "text/cooking_asian_cuisine.md"],
+    queries=[
+        "cooking tutorial video should find recipe and spice notes",
+        "food preparation recording connected to pasta cooking memory",
+        "culinary demonstration linked to cooking technique documents",
+    ],
+    difficulty="medium",
+    graded_relevance={"text/cooking_pasta.md": 2, "text/cooking_spices.md": 1, "text/cooking_asian_cuisine.md": 1, "text/cooking_grilling.md": 1},
+)
+VIDEO_TO_TEXT += _media_query_variants(  # 3 query variants: whiteboard session video
+    category="video_to_text",
+    query_type="video",
+    source_path="videos/whiteboard_session.mp4",
+    relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", "text/tech_edge_ai.md", "text/ai_embeddings.md"],
+    queries=[
+        "whiteboard session video should retrieve AI agent design notes",
+        "planning meeting recording linked to cloud architecture memory",
+        "brainstorming video connected to embedding and edge AI notes",
+    ],
+    difficulty="hard",
+    graded_relevance={"text/ai_agents.md": 2, "text/tech_cloud_computing.md": 2, "text/tech_edge_ai.md": 1, "text/ai_embeddings.md": 1},
+)
+
 # ── Video → Image queries ─────────────────────────────────────
 VIDEO_TO_IMAGE = [
     GroundTruth(
@@ -1301,6 +1536,74 @@ def get_relevance_score(self, path: str) -> int:
     ),
 ]
 
+VIDEO_TO_IMAGE += _media_query_variants(  # 4 query variants: nature timelapse video
+    category="video_to_image",
+    query_type="video",
+    source_path="videos/nature_timelapse.mp4",
+    relevant_paths=["images/forest_landscape.png", "images/mountain_landscape.png", "images/ocean_beach.png"],
+    queries=[
+        "nature video should match forest and mountain images",
+        "timelapse landscape recording connected to outdoor photos",
+        "scenic footage should recall ocean beach and mountain visuals",
+        "environment video linked to landscape image memories",
+    ],
+    difficulty="medium",
+    graded_relevance={"images/forest_landscape.png": 2, "images/mountain_landscape.png": 2, "images/ocean_beach.png": 2},
+)
+VIDEO_TO_IMAGE += _media_query_variants(  # 4 query variants: whiteboard session video
+    category="video_to_image",
+    query_type="video",
+    source_path="videos/whiteboard_session.mp4",
+    relevant_paths=["images/whiteboard_brainstorm.png", "images/whiteboard_architecture.png", "images/handwritten_notes.png"],
+    queries=[
+        "whiteboard meeting video should match brainstorm images",
+        "planning session recording connected to whiteboard photos",
+        "team board video should recall architecture sketch image",
+        "meeting notes video linked to handwritten planning image",
+    ],
+    difficulty="hard",
+    graded_relevance={"images/whiteboard_brainstorm.png": 2, "images/whiteboard_architecture.png": 2, "images/handwritten_notes.png": 1},
+)
+VIDEO_TO_IMAGE += _media_query_variants(  # 3 query variants: architecture walkthrough video
+    category="video_to_image",
+    query_type="video",
+    source_path="videos/architecture_walkthrough.mp4",
+    relevant_paths=["images/floor_plan_blueprint.png", "images/whiteboard_architecture.png", "images/neural_network_diagram.png"],
+    queries=[
+        "architecture walkthrough should match blueprint and system diagrams",
+        "building tour video connected to floor plan visual memory",
+        "architecture recording linked to whiteboard design image",
+    ],
+    difficulty="hard",
+    graded_relevance={"images/floor_plan_blueprint.png": 2, "images/whiteboard_architecture.png": 1, "images/neural_network_diagram.png": 1},
+)
+VIDEO_TO_IMAGE += _media_query_variants(  # 3 query variants: coding demo video
+    category="video_to_image",
+    query_type="video",
+    source_path="videos/coding_demo.mp4",
+    relevant_paths=["images/code_editor_screenshot.png", "images/neural_network_diagram.png", "images/whiteboard_architecture.png"],
+    queries=[
+        "coding demo should match code editor screenshot",
+        "software video connected to neural network diagram image",
+        "developer recording linked to architecture whiteboard visual",
+    ],
+    difficulty="hard",
+    graded_relevance={"images/code_editor_screenshot.png": 2, "images/neural_network_diagram.png": 1, "images/whiteboard_architecture.png": 1},
+)
+VIDEO_TO_IMAGE += _media_query_variants(  # 4 query variants: cooking tutorial video
+    category="video_to_image",
+    query_type="video",
+    source_path="videos/cooking_tutorial.mp4",
+    relevant_paths=["images/food_pasta_dish.png"],
+    queries=[
+        "cooking tutorial should match pasta dish image",
+        "recipe video connected to plated food photo",
+        "culinary demonstration linked to food image memory",
+        "kitchen video should recall pasta visual",
+    ],
+    difficulty="medium",  # graded_relevance is not passed here, so the helper forwards None to GroundTruth
+)
+
 # ── Video → Video queries (similarity) ────────────────────────
 VIDEO_TO_VIDEO = [
     GroundTruth(
@@ -1326,6 +1629,56 @@ def get_relevance_score(self, path: str) -> int:
     ),
 ]
 
+VIDEO_TO_DOCUMENT += _media_query_variants(  # 7 query variants: architecture walkthrough video
+    category="video_to_document",
+    query_type="video",
+    source_path="videos/architecture_walkthrough.mp4",
+    relevant_paths=["documents/ai_architecture_deck.pptx", "documents/project_status_q1.docx", "documents/ai_strategy_report.docx"],
+    queries=[
+        "architecture walkthrough should retrieve architecture deck",
+        "building tour video connected to planning and status docs",
+        "walkthrough recording linked to AI strategy architecture slides",
+        "architecture video should recall project milestone deck",
+        "tour footage connected to enterprise AI roadmap documents",
+        "system walkthrough recording linked to architecture presentation",
+        "architecture recording tied to portfolio planning documents",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/ai_architecture_deck.pptx": 2, "documents/project_status_q1.docx": 1, "documents/ai_strategy_report.docx": 1},
+)
+VIDEO_TO_DOCUMENT += _media_query_variants(  # 6 query variants: coding demo video
+    category="video_to_document",
+    query_type="video",
+    source_path="videos/coding_demo.mp4",
+    relevant_paths=["documents/recallforge_spec.docx", "documents/operations_manual.pdf", "documents/edge_deployment_guide.pdf", "documents/embedding_research.pdf"],
+    queries=[
+        "coding demo should find RecallForge technical specification",
+        "developer video connected to operations manual",
+        "software walkthrough linked to edge deployment guide",
+        "code demonstration should recall embedding research paper",
+        "implementation video tied to install and configuration docs",
+        "developer recording connected to local multimodal search spec",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/recallforge_spec.docx": 2, "documents/operations_manual.pdf": 2, "documents/edge_deployment_guide.pdf": 1, "documents/embedding_research.pdf": 1},
+)
+VIDEO_TO_DOCUMENT += _media_query_variants(  # 6 query variants: whiteboard session video
+    category="video_to_document",
+    query_type="video",
+    source_path="videos/whiteboard_session.mp4",
+    relevant_paths=["documents/ai_strategy_report.docx", "documents/project_status_q1.docx", "documents/quarterly_review.pptx", "documents/recallforge_spec.docx"],
+    queries=[
+        "whiteboard session should retrieve strategy roadmap documents",
+        "planning video connected to quarterly review presentation",
+        "brainstorm recording linked to project status report",
+        "whiteboard meeting should recall RecallForge spec",
+        "roadmap discussion video tied to portfolio review deck",
+        "team planning recording connected to AI strategy docs",
+    ],
+    difficulty="hard",
+    graded_relevance={"documents/ai_strategy_report.docx": 2, "documents/project_status_q1.docx": 2, "documents/quarterly_review.pptx": 1, "documents/recallforge_spec.docx": 1},
+)
+
 ALL_GROUND_TRUTH = (
     TEXT_TO_TEXT + 
     TEXT_TO_IMAGE + 
diff --git a/tests/test_cross_modal_benchmark_defs.py b/tests/test_cross_modal_benchmark_defs.py
index a660360..ec1cbdf 100644
--- a/tests/test_cross_modal_benchmark_defs.py
+++ b/tests/test_cross_modal_benchmark_defs.py
@@ -68,6 +68,43 @@ def test_text_to_document_block_matches_category(self):
             all(gt.category == "text_to_document" for gt in module.TEXT_TO_DOCUMENT)
         )
 
+    def test_weak_cross_modal_categories_have_query_floor(self):
+        """Each weak media category holds at least MIN_WEAK_CATEGORY_QUERIES queries."""
+        module = _load_cross_modal_ablation()
+        floor = module.MIN_WEAK_CATEGORY_QUERIES
+        expected = {
+            "image_to_document", "image_to_text",
+            "video_to_document", "video_to_image", "video_to_text",
+        }
+        self.assertEqual(set(module.WEAK_CROSS_MODAL_BENCHMARK_CATEGORIES), expected)
+
+        counts = {
+            category: sum(1 for gt in module.ALL_GROUND_TRUTH if gt.category == category)
+            for category in expected
+        }
+        # A floor is a lower bound: adding queries later must not fail this test.
+        for category, count in sorted(counts.items()):
+            with self.subTest(category=category):
+                self.assertGreaterEqual(count, floor)
+
+    def test_expanded_media_queries_keep_source_path_provenance(self):
+        """Every weak-category query records the media file it was issued from."""
+        module = _load_cross_modal_ablation()
+        weak_categories = module.WEAK_CROSS_MODAL_BENCHMARK_CATEGORIES
+        media_queries = [
+            gt for gt in module.ALL_GROUND_TRUTH if gt.category in weak_categories
+        ]
+        # Guard against a vacuous pass when no entry matched any weak category.
+        self.assertTrue(media_queries)
+        for entry in media_queries:
+            if entry.query_type not in ("image", "video"):
+                self.fail(f"Unexpected query_type for weak media category: {entry.query_type}")
+            if entry.query_type == "image":
+                self.assertTrue(entry.image_query_path)
+            else:
+                # Only "video" can reach this branch after the check above.
+                self.assertTrue(entry.video_query_path)
+
     def test_bm25_skip_helper_skips_image_and_video_query_categories(self):
         module = _load_cross_modal_ablation()
 
diff --git a/tests/uat/corpus/CORPUS_EXPANSION.md b/tests/uat/corpus/CORPUS_EXPANSION.md
index 0e93fb1..196a688 100644
--- a/tests/uat/corpus/CORPUS_EXPANSION.md
+++ b/tests/uat/corpus/CORPUS_EXPANSION.md
@@ -10,7 +10,7 @@ This document describes the expanded RecallForge benchmark corpus and what addit
 - **Documents**: 8 generated `.docx` / `.pptx` / `.pdf` files
 - **Total corpus documents**: 82 registered in `CORPUS_DOCS`
 - **Total indexed benchmark items**: 77 searchable items (the transcript JSON placeholders are empty and not indexed)
-- **Total benchmark queries**: 154 queries across all modalities
+- **Total benchmark queries**: 231 queries across all modalities
 
 ## New Text Documents Added (39 files)
 
@@ -139,18 +139,28 @@ To further expand the corpus for more comprehensive cross-modal testing, the fol
 |----------|-------|------|--------|------|-------------------|
 | text_to_text | 60 | 26 | 24 | 10 | 5 |
 | text_to_image | 18 | 8 | 8 | 2 | 0 |
-| image_to_text | 15 | 6 | 6 | 3 | 0 |
+| image_to_text | 20 | 6 | 9 | 5 | 0 |
 | image_to_image | 3 | 1 | 2 | 0 | 1 |
 | image_to_video | 2 | 0 | 0 | 2 | 0 |
-| image_to_document | 2 | 0 | 0 | 2 | 0 |
-| video_to_text | 3 | 0 | 2 | 1 | 0 |
-| video_to_image | 2 | 0 | 1 | 1 | 0 |
+| image_to_document | 20 | 0 | 0 | 20 | 0 |
+| video_to_text | 20 | 0 | 12 | 8 | 0 |
+| video_to_image | 20 | 0 | 9 | 11 | 0 |
 | video_to_video | 1 | 1 | 0 | 0 | 1 |
-| video_to_document | 1 | 0 | 0 | 1 | 0 |
+| video_to_document | 20 | 0 | 0 | 20 | 0 |
 | mixed_modal | 20 | 8 | 8 | 4 | 0 |
 | text_to_video | 15 | 6 | 6 | 3 | 0 |
 | text_to_document | 12 | 6 | 5 | 1 | 0 |
-| **Total** | **154** | **62** | **62** | **30** | **7** |
+| **Total** | **231** | **62** | **83** | **86** | **7** |
+
+REC-160 expanded the weakest media-query categories to 20 queries each:
+
+- `image_to_text`
+- `image_to_document`
+- `video_to_text`
+- `video_to_image`
+- `video_to_document`
+
+The added queries reuse the existing UAT media corpus, with more grounded intent labels and explicit source-path provenance. The corpus still needs additional real images and videos before these categories can be treated as a broad public benchmark.
 
 ## New Metrics Added
 
@@ -158,6 +168,7 @@ To further expand the corpus for more comprehensive cross-modal testing, the fol
 2. **Graded Relevance** - Supports 0/1/2 scores (irrelevant/partial/full)
 3. **Query Difficulty Annotation** - Each query tagged as easy/medium/hard
 4. **Per-Difficulty Metrics** - Recall reported separately for each difficulty tier
+5. **Parent-Memory vs Asset-Level Metrics** - Child assets such as video frames, transcripts, document sections, and OCR pages can credit their canonical parent memory while raw asset metrics remain visible for diagnosis
 
 ## Hard Negatives