diff --git a/README.md b/README.md
index 259e660..6afbba9 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ The reranker delivers **+20.7% R@1 over RRF fusion** and pushes R@10 to 97.8%. E
 
 *Benchmark categories: text_only (30 queries), image_only (30 queries), long_query (12 queries), typo_query (20 queries). See `benchmarks/results/pipeline_ablation_modality_results.json` for full breakdown.*
 
-For release validation, use `benchmarks/cross_modal_ablation.py`. It now checkpoints JSON output as it runs, so long MLX benchmark sessions still leave behind a partial artifact if interrupted.
+For release validation, use `benchmarks/cross_modal_ablation.py`. It checkpoints JSON output as it runs, so long MLX benchmark sessions still leave behind a partial artifact if interrupted. To turn that artifact into a ranked fix list, run `benchmarks/cross_modal_diagnostics.py`; the current report is in [docs/research/cross-modal-diagnostics.md](docs/research/cross-modal-diagnostics.md).
 
 ### Latency & resource usage
 
diff --git a/benchmarks/cross_modal_ablation.py b/benchmarks/cross_modal_ablation.py
index 026f085..8298b5f 100644
--- a/benchmarks/cross_modal_ablation.py
+++ b/benchmarks/cross_modal_ablation.py
@@ -2395,6 +2395,7 @@ def save_checkpoint(
                             "query": gt.query,
                             "query_type": gt.query_type,
                             "image_query_path": gt.image_query_path,
+                            "video_query_path": gt.video_query_path,
                             "relevant_paths": gt.relevant_paths,
                             "difficulty": gt.difficulty,
                             "is_negative_control": gt.is_negative_control,
@@ -2424,6 +2425,7 @@ def save_checkpoint(
                             "query": gt.query,
                             "query_type": gt.query_type,
                             "image_query_path": gt.image_query_path,
+                            "video_query_path": gt.video_query_path,
                             "relevant_paths": gt.relevant_paths,
                             "difficulty": gt.difficulty,
                             "is_negative_control": gt.is_negative_control,
diff --git a/benchmarks/cross_modal_diagnostics.py b/benchmarks/cross_modal_diagnostics.py
new file mode 100644
index 0000000..190a2d5
--- /dev/null
+++ b/benchmarks/cross_modal_diagnostics.py
@@ -0,0 +1,841 @@
+#!/usr/bin/env python3
+"""Diagnose weak cross-modal benchmark categories.
+
+This offline tool reads a saved ``cross_modal_ablation.py`` JSON payload and
+turns the raw per-stage metrics into a ranked diagnosis. It is intentionally
+stdlib-only so it can run after a long benchmark session without loading model
+stacks or touching the local RecallForge index.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections import Counter, defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple
+
+
+# Default paths used by the CLI when --input / --output-json / --output-md are
+# not given.
+DEFAULT_INPUT = Path("benchmarks/results/cross_modal_ablation_results.json")
+DEFAULT_JSON_OUTPUT = Path("benchmarks/results/cross_modal_diagnostics.json")
+DEFAULT_MARKDOWN_OUTPUT = Path("docs/research/cross-modal-diagnostics.md")
+
+# Default diagnostic thresholds (defaults of build_diagnostics and the CLI).
+WEAK_RECALL_AT_5 = 0.60  # best R@5 below this marks a category as weak
+MIN_CATEGORY_QUERIES = 20  # fewer queries than this flags under-sampling
+MEANINGFUL_DELTA = 0.10  # smallest recall difference treated as a real change
+
+DOCUMENT_FILE_TYPES = {"pdf", "docx", "pptx"}
+# Categories whose expected results are documents; they get the
+# document-family filter check and a small priority bump.
+DOCUMENT_CATEGORIES = {
+    "text_to_document",
+    "image_to_document",
+    "video_to_document",
+}
+# Placeholder prompts, compared against stripped, lower-cased query text.
+# The empty string is included, so a row without query text counts as generic.
+GENERIC_MEDIA_QUERIES = {
+    "",
+    "related text",
+    "related image",
+    "related video",
+    "related document",
+}
+
+
+def _safe_float(value: Any) -> Optional[float]:
+    """Coerce ``value`` to ``float``, returning ``None`` when that is impossible.
+
+    ``None``, values that are not numeric, and integers too large to be
+    represented as a float all map to ``None`` so callers can treat "missing"
+    and "unusable" the same way.
+    """
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: JSON integers are unbounded, floats are not.
+        return None
+
+
+def _stage_role(stage_name: str) -> str:
+    """Classify a stage name as ``vector``, ``bm25``, ``hybrid``, ``rrf`` or ``other``.
+
+    Matching is case-insensitive and the rules are tried in order, so a name
+    that mentions both a reranker and RRF is classified as ``"hybrid"``.
+    """
+    name = stage_name.lower()
+    rules = (
+        ("vector", name == "vector" or "vector-only" in name),
+        ("bm25", name == "bm25" or "bm25-only" in name),
+        ("hybrid", "hybrid" in name or "reranker" in name),
+        ("rrf", "rrf" in name),
+    )
+    return next((role for role, matched in rules if matched), "other")
+
+
+def _stage_lookup(stages: Mapping[str, Any]) -> Dict[str, str]:
+    """Map each stage role to the first stage name that has that role."""
+    first_by_role: Dict[str, str] = {}
+    for name in stages:
+        # setdefault keeps the earliest stage when several share a role.
+        first_by_role.setdefault(_stage_role(name), name)
+    return first_by_role
+
+
+def _category_names(payload: Mapping[str, Any]) -> List[str]:
+    """Return the sorted union of category names found in the payload.
+
+    Names come from the top-level ``categories`` entry and from the keys of
+    every mapping-valued entry under ``stages``.
+    """
+    declared = payload.get("categories", {}) or {}
+    per_stage = (payload.get("stages", {}) or {}).values()
+    names = set(declared)
+    names.update(*(data for data in per_stage if isinstance(data, Mapping)))
+    return sorted(names)
+
+
+def _category_query_count(payload: Mapping[str, Any], category: str) -> int:
+    """Return how many queries the payload reports for ``category``.
+
+    The ``queries`` value in the top-level ``categories`` metadata is
+    preferred; otherwise the first stage reporting ``total_queries`` for the
+    category is used. Values that cannot be read as an integer are skipped
+    rather than aborting the whole diagnosis. Returns ``0`` when no usable
+    count exists.
+    """
+
+    def _as_int(raw: Any) -> Optional[int]:
+        # Unreadable counts behave like missing ones.
+        try:
+            return None if raw is None else int(raw)
+        except (TypeError, ValueError, OverflowError):
+            return None
+
+    categories = payload.get("categories", {}) or {}
+    metadata = categories.get(category) if isinstance(categories, Mapping) else None
+    if isinstance(metadata, Mapping):
+        count = _as_int(metadata.get("queries"))
+        if count is not None:
+            return count
+    for stage_data in (payload.get("stages", {}) or {}).values():
+        metrics = stage_data.get(category) if isinstance(stage_data, Mapping) else None
+        if isinstance(metrics, Mapping):
+            count = _as_int(metrics.get("total_queries"))
+            if count is not None:
+                return count
+    return 0
+
+
+def _metric(
+    stages: Mapping[str, Mapping[str, Any]],
+    stage_name: Optional[str],
+    category: str,
+    metric_name: str,
+) -> Optional[float]:
+    """Return one metric for ``category`` from the named stage as a float.
+
+    Returns ``None`` when no stage name is given, when the stage or category
+    entry is missing or not a mapping, when the category is marked
+    ``skipped``, or when the value cannot be converted to a float.
+    """
+    if not stage_name:
+        return None
+    stage_data = stages.get(stage_name)
+    # Tolerate a stage entry that is not a mapping (e.g. null in a saved payload).
+    if not isinstance(stage_data, Mapping):
+        return None
+    metrics = stage_data.get(category)
+    if not isinstance(metrics, Mapping) or metrics.get("skipped"):
+        return None
+    return _safe_float(metrics.get(metric_name))
+
+
+def _asset_metric(
+    stages: Mapping[str, Mapping[str, Any]],
+    stage_name: Optional[str],
+    category: str,
+    metric_name: str,
+) -> Optional[float]:
+    """Return one ``asset_level`` metric for ``category`` from the named stage.
+
+    Returns ``None`` when no stage name is given, when the stage, category, or
+    ``asset_level`` entry is missing or not a mapping, when the category is
+    marked ``skipped``, or when the value cannot be converted to a float.
+    """
+    if not stage_name:
+        return None
+    stage_data = stages.get(stage_name)
+    # Tolerate a stage entry that is not a mapping (e.g. null in a saved payload).
+    if not isinstance(stage_data, Mapping):
+        return None
+    metrics = stage_data.get(category)
+    if not isinstance(metrics, Mapping) or metrics.get("skipped"):
+        return None
+    asset_level = metrics.get("asset_level")
+    if not isinstance(asset_level, Mapping):
+        return None
+    return _safe_float(asset_level.get(metric_name))
+
+
+def _best_stage(
+    stages: Mapping[str, Mapping[str, Any]],
+    category: str,
+    metric_name: str = "recall_at_5",
+) -> Tuple[Optional[str], Optional[float]]:
+    """Find the stage with the highest ``metric_name`` value for ``category``.
+
+    Stages that are not mappings, that skipped the category, or that lack a
+    numeric value are ignored. On ties the earliest stage in iteration order
+    wins. Returns ``(None, None)`` when no stage qualifies.
+    """
+    best_name: Optional[str] = None
+    best_value: Optional[float] = None
+    for stage_name, stage_data in stages.items():
+        # Tolerate a stage entry that is not a mapping (e.g. null in a saved payload).
+        if not isinstance(stage_data, Mapping):
+            continue
+        metrics = stage_data.get(category)
+        if not isinstance(metrics, Mapping) or metrics.get("skipped"):
+            continue
+        value = _safe_float(metrics.get(metric_name))
+        if value is None:
+            continue
+        if best_value is None or value > best_value:
+            best_name = stage_name
+            best_value = value
+    return best_name, best_value
+
+
+def _is_media_query_category(category: str, per_query_results: Sequence[Mapping[str, Any]]) -> bool:
+    """Tell whether a category is queried with an image or a video.
+
+    True when the category name starts with ``image_to_`` or ``video_to_``, or
+    when any per-query row has ``query_type`` ``"image"`` or ``"video"``.
+    """
+    media_prefixes = ("image_to_", "video_to_")
+    if category.startswith(media_prefixes):
+        return True
+    media_types = {"image", "video"}
+    for row in per_query_results:
+        if row.get("query_type") in media_types:
+            return True
+    return False
+
+
+def _first_stage_queries(
+    stages: Mapping[str, Mapping[str, Any]],
+    category: str,
+) -> List[Mapping[str, Any]]:
+    """Return per-query rows for ``category`` from the first stage that has any.
+
+    Stages are scanned in iteration order; the first one whose
+    ``per_query_results`` is a non-empty list supplies the rows, with
+    non-mapping rows dropped. Returns ``[]`` when no stage has rows.
+    """
+    for stage_data in stages.values():
+        # Tolerate a stage entry that is not a mapping (e.g. null in a saved payload).
+        if not isinstance(stage_data, Mapping):
+            continue
+        metrics = stage_data.get(category)
+        rows = metrics.get("per_query_results") if isinstance(metrics, Mapping) else None
+        if isinstance(rows, list) and rows:
+            return [row for row in rows if isinstance(row, Mapping)]
+    return []
+
+
+def _stage_queries(
+    stages: Mapping[str, Mapping[str, Any]],
+    stage_name: Optional[str],
+    category: str,
+) -> List[Mapping[str, Any]]:
+    """Return the mapping-valued per-query rows of one stage for ``category``.
+
+    Returns ``[]`` when no stage name is given, when the stage or category
+    entry is missing or not a mapping, or when ``per_query_results`` is not a
+    list.
+    """
+    if not stage_name:
+        return []
+    stage_data = stages.get(stage_name)
+    # Tolerate a stage entry that is not a mapping (e.g. null in a saved payload).
+    metrics = stage_data.get(category) if isinstance(stage_data, Mapping) else None
+    rows = metrics.get("per_query_results") if isinstance(metrics, Mapping) else None
+    if not isinstance(rows, list):
+        return []
+    return [row for row in rows if isinstance(row, Mapping)]
+
+
+def _query_values(rows: Iterable[Mapping[str, Any]]) -> List[str]:
+    """Return each row's ``query`` text, stripped and lower-cased ("" if absent)."""
+    normalized: List[str] = []
+    for row in rows:
+        text = row.get("query") or ""
+        normalized.append(str(text).strip().lower())
+    return normalized
+
+
+def _has_generic_media_queries(rows: Sequence[Mapping[str, Any]]) -> bool:
+    """Tell whether at least half of the rows use a generic placeholder query.
+
+    A query is generic when its normalized text is in
+    ``GENERIC_MEDIA_QUERIES``. Returns ``False`` for an empty ``rows``.
+    """
+    if not rows:
+        return False
+    query_values = _query_values(rows)
+    generic_count = sum(1 for query in query_values if query in GENERIC_MEDIA_QUERIES)
+    # Round the half-way mark up: with floor division a minority such as
+    # 1 of 3 or 2 of 5 was enough to report that "most" prompts are generic.
+    required = (len(query_values) + 1) // 2
+    return generic_count > 0 and generic_count >= required
+
+
+def _missing_media_query_paths(rows: Sequence[Mapping[str, Any]]) -> List[str]:
+    """List media-query rows that lack their query-file path.
+
+    An ``image`` row needs ``image_query_path`` and a ``video`` row needs
+    ``video_query_path``. Each offending row contributes its query text, or
+    ``"image query"`` / ``"video query"`` when that text is empty.
+    """
+    required = (("image", "image_query_path"), ("video", "video_query_path"))
+    labels: List[str] = []
+    for row in rows:
+        kind = row.get("query_type")
+        for media, path_key in required:
+            if kind == media and not row.get(path_key):
+                labels.append(str(row.get("query") or f"{media} query"))
+    return labels
+
+
+def _has_asset_level_metrics(stages: Mapping[str, Mapping[str, Any]], category: str) -> bool:
+    """Tell whether any stage reports a mapping-valued ``asset_level`` for ``category``."""
+    for stage_data in stages.values():
+        # Tolerate a stage entry that is not a mapping (e.g. null in a saved payload).
+        if not isinstance(stage_data, Mapping):
+            continue
+        metrics = stage_data.get(category)
+        if isinstance(metrics, Mapping) and isinstance(metrics.get("asset_level"), Mapping):
+            return True
+    return False
+
+
+def _audit_source_counts(rows: Sequence[Mapping[str, Any]], top_k: int = 5) -> Dict[str, int]:
+    """Count how often each RRF source backs a top-``top_k`` result.
+
+    For the first ``top_k`` results of every row, each entry of
+    ``result["audit"]["rrf_sources"]`` whose rank converts to a positive
+    integer adds one to that source's tally. Malformed results, audits, and
+    ranks are ignored. The returned dict is ordered by source name.
+    """
+    tally: Counter[str] = Counter()
+    for row in rows:
+        top_results = (row.get("results") or [])[:top_k]
+        for result in top_results:
+            audit = result.get("audit") if isinstance(result, Mapping) else None
+            sources = audit.get("rrf_sources") if isinstance(audit, Mapping) else None
+            if not isinstance(sources, Mapping):
+                continue
+            for source, rank in sources.items():
+                try:
+                    contributed = int(rank) > 0
+                except (TypeError, ValueError):
+                    continue
+                if contributed:
+                    tally[str(source)] += 1
+    return dict(sorted(tally.items()))
+
+
+def _normalize_result_path(filepath: str) -> str:
+    """Shorten a result path for display in reports.
+
+    A ``recallforge://`` URI keeps only what follows its third ``/``, and any
+    path containing ``/tests/uat/corpus/`` is cut to the part after the first
+    occurrence of that marker. Falsy input yields ``""``.
+    """
+    path = str(filepath or "")
+    if path.startswith("recallforge://"):
+        path = path.split("/", 3)[-1]
+    _, found, tail = path.partition("/tests/uat/corpus/")
+    return tail if found else path
+
+
+def _failure_examples(
+    rows: Sequence[Mapping[str, Any]],
+    *,
+    limit: int = 3,
+) -> List[Dict[str, Any]]:
+    """Collect example rows that did not hit within the top 5.
+
+    A row counts as a miss unless its ``hit_at_5`` is exactly ``True``. Each
+    example carries the row's query fields plus the normalized paths of its
+    first three results. Collection stops as soon as ``limit`` examples have
+    been gathered (the check runs after each append).
+    """
+    misses: List[Dict[str, Any]] = []
+    for row in rows:
+        if row.get("hit_at_5") is True:
+            continue
+        leading = (row.get("results") or [])[:3]
+        shown_paths = [
+            _normalize_result_path(str(hit.get("filepath") or ""))
+            for hit in leading
+            if isinstance(hit, Mapping)
+        ]
+        example = {
+            "query": row.get("query") or "",
+            "query_type": row.get("query_type") or "text",
+            "image_query_path": row.get("image_query_path"),
+            "video_query_path": row.get("video_query_path"),
+            "relevant_paths": row.get("relevant_paths") or [],
+            "top_results": shown_paths,
+        }
+        misses.append(example)
+        if len(misses) >= limit:
+            break
+    return misses
+
+
+def _issue(code: str, severity: str, evidence: str) -> Dict[str, str]:
+    """Build one diagnostic finding record."""
+    return dict(code=code, severity=severity, evidence=evidence)
+
+
+def _action(owner: str, priority: int, title: str, rationale: str) -> Dict[str, Any]:
+    """Build one entry of the prioritized fix list."""
+    return dict(owner=owner, priority=priority, title=title, rationale=rationale)
+
+
+def _target_content_type(category: str) -> Optional[str]:
+    """Derive the expected result family from the category name's suffix.
+
+    Returns ``"text"``, ``"image"``, ``"video"`` or ``"document_family"`` for
+    the matching ``_text`` / ``_image`` / ``_video`` / ``_document`` suffix,
+    and ``None`` for any other name.
+    """
+    suffix_targets = (
+        ("_text", "text"),
+        ("_image", "image"),
+        ("_video", "video"),
+        ("_document", "document_family"),
+    )
+    for suffix, target in suffix_targets:
+        if category.endswith(suffix):
+            return target
+    return None
+
+
+def _document_filter_gap(
+    category: str,
+    content_type_filters: Mapping[str, Optional[str]],
+) -> bool:
+    """Tell whether a document category lacks a usable document-family filter.
+
+    Only categories in ``DOCUMENT_CATEGORIES`` can have a gap; for those, a
+    gap is reported when the configured filter is absent, ``None``, or the
+    plain ``"document"`` value.
+    """
+    is_document_category = category in DOCUMENT_CATEGORIES
+    if not is_document_category:
+        return False
+    configured = content_type_filters.get(category)
+    if configured is None:
+        return True
+    return configured == "document"
+
+
+def _build_category_diagnosis(
+    payload: Mapping[str, Any],
+    category: str,
+    *,
+    content_type_filters: Mapping[str, Optional[str]],
+    min_queries: int,
+    weak_recall_at_5: float,
+    meaningful_delta: float,
+) -> Dict[str, Any]:
+    """Diagnose one benchmark category from the saved per-stage metrics.
+
+    Args:
+        payload: Parsed ablation JSON; ``stages`` and ``categories`` are read.
+        category: Name of the category to diagnose.
+        content_type_filters: Configured content-type filter per category.
+        min_queries: Query count below which the category is under-sampled.
+        weak_recall_at_5: R@5 below which a result is considered weak.
+        meaningful_delta: Smallest recall difference treated as a real change.
+
+    Returns:
+        A dict with the category's query count, best stage, per-role recall
+        metrics, detected issues, priority score, top-5 audit source counts,
+        and example misses.
+    """
+    stages = payload.get("stages", {}) or {}
+    role_to_stage = _stage_lookup(stages)
+    first_rows = _first_stage_queries(stages, category)
+    best_name, best_r5 = _best_stage(stages, category, "recall_at_5")
+
+    vector_stage = role_to_stage.get("vector")
+    bm25_stage = role_to_stage.get("bm25")
+    rrf_stage = role_to_stage.get("rrf")
+    hybrid_stage = role_to_stage.get("hybrid")
+
+    vector_r5 = _metric(stages, vector_stage, category, "recall_at_5")
+    vector_r10 = _metric(stages, vector_stage, category, "recall_at_10")
+    bm25_r5 = _metric(stages, bm25_stage, category, "recall_at_5")
+    rrf_r5 = _metric(stages, rrf_stage, category, "recall_at_5")
+    rrf_r10 = _metric(stages, rrf_stage, category, "recall_at_10")
+    hybrid_r5 = _metric(stages, hybrid_stage, category, "recall_at_5")
+    hybrid_r10 = _metric(stages, hybrid_stage, category, "recall_at_10")
+    best_rows = _stage_queries(stages, best_name, category)
+
+    total_queries = _category_query_count(payload, category)
+    issues: List[Dict[str, str]] = []
+    media_query = _is_media_query_category(category, first_rows)
+
+    # Sample size: too few queries makes every other signal noisy.
+    if total_queries < min_queries:
+        issues.append(
+            _issue(
+                "under_sampled_category",
+                "high" if total_queries < 5 else "medium",
+                f"{total_queries} queries is below the {min_queries}-query diagnostic floor.",
+            )
+        )
+
+    # BM25 stage explicitly skipped for this category.
+    bm25_metrics = stages.get(bm25_stage or "", {}).get(category)
+    if isinstance(bm25_metrics, Mapping) and bm25_metrics.get("skipped"):
+        issues.append(
+            _issue(
+                "bm25_modality_blind",
+                "medium",
+                str(bm25_metrics.get("skip_reason") or "BM25 stage skipped."),
+            )
+        )
+
+    # Raw embedding quality for image/video queries.
+    if vector_r5 is not None and vector_r5 < weak_recall_at_5 and media_query:
+        issues.append(
+            _issue(
+                "embedding_alignment_gap",
+                "high",
+                f"Vector-only R@5={vector_r5:.3f}; media-query categories need raw embedding alignment above {weak_recall_at_5:.2f}.",
+            )
+        )
+
+    # RRF versus vector-only: contribution of derived text.
+    if vector_r5 is not None and rrf_r5 is not None:
+        lift = rrf_r5 - vector_r5
+        # Compare R@10 only when both stages report it; treating a missing
+        # value as 0.0 would fabricate a lift out of absent data.
+        r10_lift: Optional[float] = None
+        if rrf_r10 is not None and vector_r10 is not None:
+            r10_lift = rrf_r10 - vector_r10
+        r10_text = "n/a" if r10_lift is None else f"{r10_lift:+.3f}"
+        if lift >= meaningful_delta or (r10_lift is not None and r10_lift >= meaningful_delta):
+            issues.append(
+                _issue(
+                    "derived_text_probe_lift",
+                    "positive",
+                    f"RRF improves R@5 by {lift:+.3f} and R@10 by {r10_text} over raw vector search.",
+                )
+            )
+        elif media_query and best_r5 is not None and best_r5 < weak_recall_at_5:
+            issues.append(
+                _issue(
+                    "derived_text_probe_insufficient",
+                    "high",
+                    f"RRF R@5={rrf_r5:.3f} does not materially lift vector R@5={vector_r5:.3f}.",
+                )
+            )
+
+    # Hybrid versus RRF: contribution of the reranker.
+    if rrf_r5 is not None and hybrid_r5 is not None:
+        rerank_delta = hybrid_r5 - rrf_r5
+        if rerank_delta <= -meaningful_delta:
+            issues.append(
+                _issue(
+                    "reranker_regression",
+                    "high",
+                    f"Hybrid reranker R@5={hybrid_r5:.3f} trails RRF R@5={rrf_r5:.3f}.",
+                )
+            )
+        elif best_r5 is not None and best_r5 < weak_recall_at_5 and abs(rerank_delta) < meaningful_delta:
+            issues.append(
+                _issue(
+                    "reranker_no_lift",
+                    "medium",
+                    f"Hybrid reranker changes R@5 by only {rerank_delta:+.3f} versus RRF.",
+                )
+            )
+
+    # Parent-memory versus child-asset scoring, when the payload has both.
+    if not _has_asset_level_metrics(stages, category):
+        issues.append(
+            _issue(
+                "parent_asset_metrics_missing",
+                "medium",
+                "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+            )
+        )
+    elif best_name:
+        memory_r5 = _metric(stages, best_name, category, "recall_at_5")
+        asset_r5 = _asset_metric(stages, best_name, category, "recall_at_5")
+        if memory_r5 is not None and asset_r5 is not None and memory_r5 - asset_r5 >= meaningful_delta:
+            issues.append(
+                _issue(
+                    "parent_rollup_matters",
+                    "positive",
+                    f"Parent-memory R@5={memory_r5:.3f} is {memory_r5 - asset_r5:+.3f} above raw asset R@5.",
+                )
+            )
+
+    # Benchmark-definition artifacts.
+    if _document_filter_gap(category, content_type_filters):
+        issues.append(
+            _issue(
+                "document_family_filter_gap",
+                "medium",
+                "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.",
+            )
+        )
+
+    if _has_generic_media_queries(first_rows):
+        issues.append(
+            _issue(
+                "generic_query_artifact",
+                "medium",
+                "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+            )
+        )
+
+    missing_paths = _missing_media_query_paths(first_rows)
+    if missing_paths:
+        issues.append(
+            _issue(
+                "media_query_path_missing",
+                "low",
+                f"{len(missing_paths)} per-query rows omit image_query_path or video_query_path instrumentation.",
+            )
+        )
+
+    # Ranking score: shortfall against the weak threshold plus fixed bumps for
+    # under-sampled, media-query, and document categories. A category with no
+    # best R@5 is scored as if its recall were 0.
+    weakness = max(0.0, weak_recall_at_5 - (best_r5 or 0.0))
+    sample_penalty = 0.15 if total_queries < min_queries else 0.0
+    media_penalty = 0.05 if media_query else 0.0
+    document_penalty = 0.05 if category in DOCUMENT_CATEGORIES else 0.0
+    priority_score = round(weakness + sample_penalty + media_penalty + document_penalty, 4)
+
+    # Audit counts come from the hybrid stage, falling back to RRF rows.
+    rrf_rows = _stage_queries(stages, rrf_stage, category)
+    hybrid_rows = _stage_queries(stages, hybrid_stage, category)
+    audit_sources = _audit_source_counts(hybrid_rows or rrf_rows)
+
+    return {
+        "category": category,
+        "target_content_type": _target_content_type(category),
+        "configured_content_type_filter": content_type_filters.get(category),
+        "total_queries": total_queries,
+        "best_stage": best_name,
+        "best_recall_at_5": best_r5,
+        "priority_score": priority_score,
+        "metrics": {
+            "vector_recall_at_5": vector_r5,
+            "vector_recall_at_10": vector_r10,
+            "bm25_recall_at_5": bm25_r5,
+            "rrf_recall_at_5": rrf_r5,
+            "rrf_recall_at_10": rrf_r10,
+            "hybrid_recall_at_5": hybrid_r5,
+            "hybrid_recall_at_10": hybrid_r10,
+        },
+        "issues": issues,
+        "audit_source_counts_top5": audit_sources,
+        "failure_examples": _failure_examples(best_rows or first_rows),
+    }
+
+
+def _aggregate_actions(diagnoses: Sequence[Mapping[str, Any]]) -> List[Dict[str, Any]]:
+    """Turn the issue codes found across all categories into a fix list.
+
+    Each conditional action is emitted at most once, when at least one
+    category reports a triggering issue code; the ``model_research`` action is
+    always appended last. Priorities are fixed per action.
+    """
+    codes_by_category: Dict[str, List[str]] = {
+        str(diagnosis["category"]): [str(issue["code"]) for issue in diagnosis.get("issues", [])]
+        for diagnosis in diagnoses
+    }
+    seen_codes = {code for codes in codes_by_category.values() for code in codes}
+
+    plan: List[Dict[str, Any]] = []
+
+    if seen_codes & {"embedding_alignment_gap", "derived_text_probe_insufficient"}:
+        plan.append(
+            _action(
+                "search",
+                1,
+                "Add bounded cascaded media reranking only after cheap top-K retrieval.",
+                "The weakest media-query categories are not rescued by current RRF/reranker stages, so REC-130 should target a strict top-K cascade instead of broad expensive scoring.",
+            )
+        )
+
+    if "under_sampled_category" in seen_codes:
+        plan.append(
+            _action(
+                "evals",
+                2,
+                "Expand weak categories to at least 20 queries and keep parent-memory scoring.",
+                "Several weak categories have 1-3 examples, which is too small to distinguish model weakness from benchmark noise; this maps directly to REC-160.",
+            )
+        )
+
+    if "document_family_filter_gap" in seen_codes:
+        plan.append(
+            _action(
+                "indexing",
+                3,
+                "Represent document-family filters explicitly across pdf/docx/pptx roots and children.",
+                "Document retrieval categories are evaluated without a proper document-family content filter, so unrelated images/videos can dominate media-query results.",
+            )
+        )
+
+    if "generic_query_artifact" in seen_codes:
+        plan.append(
+            _action(
+                "evals",
+                4,
+                "Replace placeholder media prompts with grounded intent labels and provenance.",
+                "Queries such as 'related document' are useful smoke probes but too ambiguous for release-quality diagnostics.",
+            )
+        )
+
+    if "parent_asset_metrics_missing" in seen_codes:
+        plan.append(
+            _action(
+                "evals",
+                5,
+                "Rerun cross-modal ablation with the current harness to populate asset_level metrics.",
+                "The checked-in result is from v0.2.0 and predates serialized asset-level rollups, so it cannot fully separate child-asset hits from parent-memory hits.",
+            )
+        )
+
+    if "derived_text_probe_lift" in seen_codes:
+        plan.append(
+            _action(
+                "ingest",
+                6,
+                "Keep strengthening captions, transcripts, and OCR as first-class retrieval text.",
+                "Where RRF improves over vector-only, the improvement is evidence that derived text is helping and should be cached/versioned rather than recomputed ad hoc.",
+            )
+        )
+
+    # Unconditional: external baselines are always worth comparing against.
+    plan.append(
+        _action(
+            "model_research",
+            7,
+            "Benchmark visual/document-specialized retrievers against the weak categories.",
+            "ViDoRe-style visual document retrieval and MTEB/BEIR-style qrels offer better external baselines for document-heavy failures than anecdotes from one synthetic corpus.",
+        )
+    )
+
+    return plan
+
+
+def _issue_summary(diagnoses: Sequence[Mapping[str, Any]]) -> Dict[str, int]:
+    """Count how many times each issue code occurs, ordered by code."""
+    codes = (
+        str(issue["code"])
+        for diagnosis in diagnoses
+        for issue in diagnosis.get("issues", [])
+    )
+    return dict(sorted(Counter(codes).items()))
+
+
+def _load_content_type_filters(categories: Sequence[str]) -> Dict[str, Optional[str]]:
+    """Inspect the current benchmark helper without making it a hard dependency.
+
+    Imports ``benchmarks.cross_modal_ablation`` (adding the project root to
+    ``sys.path`` if needed) and asks its private
+    ``_result_content_type_for_category`` helper for each category's filter.
+    Returns an empty dict when the module cannot be imported or the helper is
+    missing, so the diagnostics still run against a saved payload alone.
+    """
+    try:
+        project_root = Path(__file__).resolve().parent.parent
+        if str(project_root) not in sys.path:
+            sys.path.insert(0, str(project_root))
+        from benchmarks import cross_modal_ablation  # type: ignore
+    except Exception:
+        return {}
+    # The helper is private to the harness; treat its absence like a failed
+    # import instead of letting an AttributeError abort the diagnosis.
+    resolve = getattr(cross_modal_ablation, "_result_content_type_for_category", None)
+    if not callable(resolve):
+        return {}
+    return {category: resolve(category) for category in categories}
+
+
+def build_diagnostics(
+    payload: Mapping[str, Any],
+    *,
+    content_type_filters: Optional[Mapping[str, Optional[str]]] = None,
+    min_queries: int = MIN_CATEGORY_QUERIES,
+    weak_recall_at_5: float = WEAK_RECALL_AT_5,
+    meaningful_delta: float = MEANINGFUL_DELTA,
+) -> Dict[str, Any]:
+    """Build the full diagnostics report for a saved ablation payload.
+
+    Args:
+        payload: Parsed ``cross_modal_ablation.py`` JSON.
+        content_type_filters: Configured content-type filter per category.
+            ``None`` (the default) looks them up from the benchmark module;
+            any mapping, including an empty one, is used as given.
+        min_queries: Query count below which a category is under-sampled.
+        weak_recall_at_5: R@5 below which a category is considered weak.
+        meaningful_delta: Smallest recall difference treated as a real change.
+
+    Returns:
+        A JSON-serializable dict with source metadata, thresholds, a summary,
+        per-category diagnoses sorted by descending priority score (then
+        name), the weak subset, prioritized actions, and method notes.
+    """
+    categories = _category_names(payload)
+    # ``None`` means "not supplied". An explicit empty mapping must not be
+    # replaced by auto-detected filters (and must not trigger the import).
+    if content_type_filters is None:
+        filters = dict(_load_content_type_filters(categories))
+    else:
+        filters = dict(content_type_filters)
+    diagnoses = [
+        _build_category_diagnosis(
+            payload,
+            category,
+            content_type_filters=filters,
+            min_queries=min_queries,
+            weak_recall_at_5=weak_recall_at_5,
+            meaningful_delta=meaningful_delta,
+        )
+        for category in categories
+    ]
+    diagnoses.sort(
+        key=lambda row: (
+            -float(row["priority_score"]),
+            str(row["category"]),
+        )
+    )
+
+    # Weak or at-risk: no measurable best R@5, best R@5 under the threshold,
+    # or a priority score of at least 0.20.
+    weak_categories = [
+        diagnosis
+        for diagnosis in diagnoses
+        if diagnosis["best_recall_at_5"] is None
+        or float(diagnosis["best_recall_at_5"]) < weak_recall_at_5
+        or float(diagnosis["priority_score"]) >= 0.20
+    ]
+
+    return {
+        "diagnostic": "cross_modal_retrieval",
+        "diagnosed_at": datetime.now(timezone.utc).isoformat(),
+        "source": {
+            "benchmark": payload.get("benchmark"),
+            "version": payload.get("version"),
+            "generated_at": payload.get("generated_at"),
+            "run_status": payload.get("run_status"),
+        },
+        "thresholds": {
+            "weak_recall_at_5": weak_recall_at_5,
+            "min_category_queries": min_queries,
+            "meaningful_delta": meaningful_delta,
+        },
+        "summary": {
+            "categories": len(diagnoses),
+            "weak_categories": len(weak_categories),
+            "issue_counts": _issue_summary(diagnoses),
+        },
+        "weak_categories": weak_categories,
+        "all_categories": diagnoses,
+        "prioritized_actions": _aggregate_actions(diagnoses),
+        "method_notes": [
+            "Vector-only is treated as the raw embedding baseline.",
+            "RRF lift over vector-only is treated as evidence from derived text probes such as captions, transcripts, OCR, or BM25 text.",
+            "Hybrid-minus-RRF isolates the current reranker contribution.",
+            "Parent-memory versus asset-level scoring is only available when the source payload includes asset_level metrics.",
+        ],
+    }
+
+
+def _fmt_pct(value: Optional[float]) -> str:
+    """Format a ratio as a one-decimal percentage, or ``"n/a"`` for ``None``."""
+    return "n/a" if value is None else f"{value:.1%}"
+
+
+def _fmt_filter(value: Optional[str]) -> str:
+    """Render a content-type filter as inline Markdown code (``None`` included)."""
+    label = "None" if value is None else value
+    return f"`{label}`"
+
+
+def render_markdown(diagnostics: Mapping[str, Any]) -> str:
+    """Render a diagnostics dict as the Markdown report.
+
+    The report contains, in order: source metadata, the weak/at-risk category
+    table, issue counts, the prioritized fix list, per-category evidence with
+    example misses, method notes, and external evaluation references.
+    Missing sections of ``diagnostics`` render as empty.
+    """
+    source = diagnostics.get("source", {}) or {}
+    thresholds = diagnostics.get("thresholds", {}) or {}
+    weak_categories = diagnostics.get("weak_categories", []) or []
+    all_categories = diagnostics.get("all_categories", []) or []
+    actions = diagnostics.get("prioritized_actions", []) or []
+    issue_counts = (diagnostics.get("summary", {}) or {}).get("issue_counts", {}) or {}
+
+    def pct(record: Mapping[str, Any], key: str) -> str:
+        # Percentage text for a possibly missing or non-numeric field.
+        return _fmt_pct(_safe_float(record.get(key)))
+
+    def source_value(key: str) -> Any:
+        return source.get(key) or "unknown"
+
+    lines: List[str] = []
+    lines += [
+        "# Cross-Modal Retrieval Diagnostics",
+        "",
+        "This report is generated from the saved cross-modal ablation JSON. It separates raw embedding alignment, derived-text contribution, reranker contribution, benchmark artifacts, and parent-memory scoring coverage.",
+        "",
+        "## Source",
+        "",
+        f"- Benchmark: `{source_value('benchmark')}`",
+        f"- Source version: `{source_value('version')}`",
+        f"- Source generated at: `{source_value('generated_at')}`",
+        f"- Run status: `{source_value('run_status')}`",
+        f"- Weak threshold: R@5 < {pct(thresholds, 'weak_recall_at_5')}",
+        f"- Query floor: {thresholds.get('min_category_queries')} queries per category",
+        "",
+        "## Weak And At-Risk Category Ranking",
+        "",
+        "| Priority | Category | Queries | Best stage | Best R@5 | Vector R@5 | RRF R@5 | Hybrid R@5 | Key issues |",
+        "|---:|---|---:|---|---:|---:|---:|---:|---|",
+    ]
+
+    # One table row per weak category; at most four issue codes are listed.
+    for diagnosis in weak_categories:
+        metrics = diagnosis.get("metrics", {}) or {}
+        issues = ", ".join(issue["code"] for issue in diagnosis.get("issues", [])[:4])
+        cells = [
+            f"{diagnosis.get('priority_score'):.2f}",
+            f"`{diagnosis.get('category')}`",
+            f"{diagnosis.get('total_queries')}",
+            f"{diagnosis.get('best_stage') or 'n/a'}",
+            pct(diagnosis, "best_recall_at_5"),
+            pct(metrics, "vector_recall_at_5"),
+            pct(metrics, "rrf_recall_at_5"),
+            pct(metrics, "hybrid_recall_at_5"),
+            f"{issues or 'none'}",
+        ]
+        lines.append("| " + " | ".join(cells) + " |")
+
+    lines += ["", "## Diagnosis Summary", ""]
+    lines += [f"- `{code}`: {count}" for code, count in issue_counts.items()]
+
+    lines += ["", "## Prioritized Fix List", ""]
+    lines += [
+        f"{action['priority']}. **{action['owner']}** - {action['title']} {action['rationale']}"
+        for action in actions
+    ]
+
+    lines += ["", "## Category Evidence", ""]
+    for diagnosis in all_categories:
+        metrics = diagnosis.get("metrics", {}) or {}
+        lines += [
+            f"### `{diagnosis.get('category')}`",
+            "",
+            f"- Queries: {diagnosis.get('total_queries')}",
+            f"- Target result family: `{diagnosis.get('target_content_type') or 'mixed'}`",
+            f"- Configured benchmark content filter: {_fmt_filter(diagnosis.get('configured_content_type_filter'))}",
+            f"- Best stage/R@5: {diagnosis.get('best_stage') or 'n/a'} / {pct(diagnosis, 'best_recall_at_5')}",
+            f"- Raw vector R@5/R@10: {pct(metrics, 'vector_recall_at_5')} / {pct(metrics, 'vector_recall_at_10')}",
+            f"- RRF R@5/R@10: {pct(metrics, 'rrf_recall_at_5')} / {pct(metrics, 'rrf_recall_at_10')}",
+            f"- Hybrid R@5/R@10: {pct(metrics, 'hybrid_recall_at_5')} / {pct(metrics, 'hybrid_recall_at_10')}",
+        ]
+        audit_counts = diagnosis.get("audit_source_counts_top5") or {}
+        if audit_counts:
+            source_text = ", ".join(f"`{key}`={value}" for key, value in audit_counts.items())
+            lines.append(f"- Top-5 audit source counts: {source_text}")
+        lines += [
+            f"- `{issue['code']}` ({issue['severity']}): {issue['evidence']}"
+            for issue in diagnosis.get("issues", [])
+        ]
+        examples = diagnosis.get("failure_examples") or []
+        if examples:
+            lines.append("- Example misses:")
+        for example in examples:
+            shown = [f"`{path}`" for path in (example.get("top_results") or []) if path]
+            top_results = ", ".join(shown)
+            query = example.get("query") or "<empty media query>"
+            lines.append(
+                f"  - `{query}` ({example.get('query_type')}): expected {example.get('relevant_paths')}; top results {top_results or 'n/a'}"
+            )
+        lines.append("")
+
+    lines += ["## Method Notes", ""]
+    lines += [f"- {note}" for note in diagnostics.get("method_notes", []) or []]
+
+    lines += [
+        "",
+        "## External Evaluation References",
+        "",
+        "- [BEIR](https://github.com/beir-cellar/beir) structures retrieval evaluation around corpus, queries, qrels, run results, and metrics such as NDCG, MAP, Recall, Precision, and MRR.",
+        "- [MTEB](https://github.com/embeddings-benchmark/mteb) is the broader embedding and retrieval evaluation framework now used by ViDoRe for single-model retriever submissions.",
+        "- [ViDoRe pipeline evaluation](https://github.com/illuin-tech/vidore-benchmark) explicitly covers multi-stage, hybrid, reranking, OCR, and custom preprocessing pipelines for visual document retrieval.",
+        "",
+    ]
+    return "\n".join(lines)
+
+
+def _write_json(path: Path, payload: Mapping[str, Any]) -> None:
+    """Write ``payload`` to ``path`` as indented, key-sorted UTF-8 JSON.
+
+    Parent directories are created as needed and the file ends with a newline.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(payload, indent=2, sort_keys=True)
+    path.write_text(f"{serialized}\n", encoding="utf-8")
+
+
+def _write_text(path: Path, text: str) -> None:
+    """Write ``text`` to ``path`` as UTF-8, creating parent directories as needed."""
+    parent_dir = path.parent
+    parent_dir.mkdir(parents=True, exist_ok=True)
+    path.write_text(text, encoding="utf-8")
+
+
+def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
+    """Parse the CLI options; ``argv=None`` makes argparse read ``sys.argv``."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    # (flag, type, default, help) for every supported option.
+    options = (
+        ("--input", Path, DEFAULT_INPUT, "cross_modal_ablation JSON to diagnose"),
+        ("--output-json", Path, DEFAULT_JSON_OUTPUT, "diagnostic JSON output path"),
+        ("--output-md", Path, DEFAULT_MARKDOWN_OUTPUT, "diagnostic Markdown output path"),
+        ("--min-queries", int, MIN_CATEGORY_QUERIES, "minimum desired queries per category"),
+        ("--weak-recall-at-5", float, WEAK_RECALL_AT_5, "R@5 threshold for weak categories"),
+    )
+    for flag, kind, default, help_text in options:
+        parser.add_argument(flag, type=kind, default=default, help=help_text)
+    return parser.parse_args(argv)
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    """Run the CLI: read the payload, write the JSON and Markdown reports, return 0."""
+    args = parse_args(argv)
+    raw_payload = args.input.read_text(encoding="utf-8")
+    diagnostics = build_diagnostics(
+        json.loads(raw_payload),
+        min_queries=args.min_queries,
+        weak_recall_at_5=args.weak_recall_at_5,
+    )
+    _write_json(args.output_json, diagnostics)
+    _write_text(args.output_md, render_markdown(diagnostics))
+    for written in (args.output_json, args.output_md):
+        print(f"Wrote {written}")
+    return 0
+
+
+if __name__ == "__main__":
+    # Propagate main()'s return value as the process exit status.
+    sys.exit(main())
diff --git a/benchmarks/results/cross_modal_diagnostics.json b/benchmarks/results/cross_modal_diagnostics.json
new file mode 100644
index 0000000..76a4473
--- /dev/null
+++ b/benchmarks/results/cross_modal_diagnostics.json
@@ -0,0 +1,1579 @@
+{
+  "all_categories": [
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 8,
+        "original_vec": 8
+      },
+      "best_recall_at_5": 0.0,
+      "best_stage": "Vector-only",
+      "category": "image_to_document",
+      "configured_content_type_filter": null,
+      "failure_examples": [
+        {
+          "image_query_path": "images/neural_network_diagram.png",
+          "query": "related document",
+          "query_type": "image",
+          "relevant_paths": [
+            "documents/ai_strategy_report.docx",
+            "documents/ai_architecture_deck.pptx",
+            "documents/embedding_research.pdf"
+          ],
+          "top_results": [
+            "images/neural_network_diagram.png",
+            "images/whiteboard_brainstorm.png",
+            "images/whiteboard_architecture.png"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": "images/code_editor_screenshot.png",
+          "query": "related document",
+          "query_type": "image",
+          "relevant_paths": [
+            "documents/recallforge_spec.docx",
+            "documents/operations_manual.pdf"
+          ],
+          "top_results": [
+            "images/code_editor_screenshot.png",
+            "videos/coding_demo.mp4::transcript:0001@0.00s",
+            "images/whiteboard_architecture.png"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "2 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process image queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_lift",
+          "evidence": "RRF improves R@5 by +0.000 and R@10 by +0.500 over raw vector search.",
+          "severity": "positive"
+        },
+        {
+          "code": "reranker_no_lift",
+          "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "document_family_filter_gap",
+          "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 0.5,
+        "hybrid_recall_at_5": 0.0,
+        "rrf_recall_at_10": 0.5,
+        "rrf_recall_at_5": 0.0,
+        "vector_recall_at_10": 0.0,
+        "vector_recall_at_5": 0.0
+      },
+      "priority_score": 0.85,
+      "target_content_type": "document_family",
+      "total_queries": 2
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 4,
+        "original_vec": 4
+      },
+      "best_recall_at_5": 0.0,
+      "best_stage": "Vector-only",
+      "category": "video_to_document",
+      "configured_content_type_filter": null,
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "related document",
+          "query_type": "video",
+          "relevant_paths": [
+            "documents/ai_architecture_deck.pptx"
+          ],
+          "top_results": [
+            "videos/architecture_walkthrough.mp4",
+            "videos/architecture_walkthrough.mp4::frame:0004@15.00s",
+            "videos/architecture_walkthrough.mp4::frame:0006@25.00s"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "1 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process video queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_insufficient",
+          "evidence": "RRF R@5=0.000 does not materially lift vector R@5=0.000.",
+          "severity": "high"
+        },
+        {
+          "code": "reranker_no_lift",
+          "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "document_family_filter_gap",
+          "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        },
+        {
+          "code": "media_query_path_missing",
+          "evidence": "1 per-query rows omit image_query_path or video_query_path instrumentation.",
+          "severity": "low"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 0.0,
+        "hybrid_recall_at_5": 0.0,
+        "rrf_recall_at_10": 0.0,
+        "rrf_recall_at_5": 0.0,
+        "vector_recall_at_10": 0.0,
+        "vector_recall_at_5": 0.0
+      },
+      "priority_score": 0.85,
+      "target_content_type": "document_family",
+      "total_queries": 1
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 7,
+        "original_vec": 8
+      },
+      "best_recall_at_5": 0.0,
+      "best_stage": "Vector-only",
+      "category": "video_to_image",
+      "configured_content_type_filter": "image",
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "related image",
+          "query_type": "video",
+          "relevant_paths": [
+            "images/forest_landscape.png",
+            "images/mountain_landscape.png",
+            "images/ocean_beach.png"
+          ],
+          "top_results": [
+            "videos/nature_timelapse.mp4::frame:0006@25.00s",
+            "videos/nature_timelapse.mp4::frame:0005@20.00s",
+            "videos/nature_timelapse.mp4::frame:0007@30.00s"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": null,
+          "query": "related image",
+          "query_type": "video",
+          "relevant_paths": [
+            "images/whiteboard_brainstorm.png",
+            "images/whiteboard_architecture.png"
+          ],
+          "top_results": [
+            "videos/whiteboard_session.mp4::frame:0003@10.00s",
+            "videos/whiteboard_session.mp4::frame:0002@5.00s",
+            "videos/whiteboard_session.mp4::frame:0001@0.00s"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "2 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process video queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_insufficient",
+          "evidence": "RRF R@5=0.000 does not materially lift vector R@5=0.000.",
+          "severity": "high"
+        },
+        {
+          "code": "reranker_no_lift",
+          "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        },
+        {
+          "code": "media_query_path_missing",
+          "evidence": "2 per-query rows omit image_query_path or video_query_path instrumentation.",
+          "severity": "low"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 0.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 0.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 0.0
+      },
+      "priority_score": 0.8,
+      "target_content_type": "image",
+      "total_queries": 2
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 8,
+        "original_vec": 12
+      },
+      "best_recall_at_5": 0.3333,
+      "best_stage": "Vector-only",
+      "category": "video_to_text",
+      "configured_content_type_filter": "text",
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "related text",
+          "query_type": "video",
+          "relevant_paths": [
+            "text/tech_cybersecurity.md",
+            "text/tech_cloud_computing.md"
+          ],
+          "top_results": [
+            "videos/coding_demo.mp4::transcript:0002@2.00s",
+            "videos/whiteboard_session.mp4::transcript:0001@0.00s",
+            "text/medicine_nutrition.md"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": null,
+          "query": "related text",
+          "query_type": "video",
+          "relevant_paths": [
+            "text/architecture_gothic.md",
+            "text/architecture_modern.md",
+            "text/architecture_blueprints.md"
+          ],
+          "top_results": [
+            "videos/coding_demo.mp4::transcript:0002@2.00s",
+            "text/ai_agents.md",
+            "documents/ai_architecture_deck.pptx::slide:0001"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "3 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process video queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.333; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_insufficient",
+          "evidence": "RRF R@5=0.333 does not materially lift vector R@5=0.333.",
+          "severity": "high"
+        },
+        {
+          "code": "reranker_no_lift",
+          "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        },
+        {
+          "code": "media_query_path_missing",
+          "evidence": "3 per-query rows omit image_query_path or video_query_path instrumentation.",
+          "severity": "low"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 0.3333,
+        "hybrid_recall_at_5": 0.3333,
+        "rrf_recall_at_10": 0.3333,
+        "rrf_recall_at_5": 0.3333,
+        "vector_recall_at_10": 0.3333,
+        "vector_recall_at_5": 0.3333
+      },
+      "priority_score": 0.4667,
+      "target_content_type": "text",
+      "total_queries": 3
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 8,
+        "original_vec": 12
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "image_to_image",
+      "configured_content_type_filter": "image",
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "3 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process image queries",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.2,
+      "target_content_type": "image",
+      "total_queries": 3
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 60,
+        "original_vec": 63
+      },
+      "best_recall_at_5": 0.6,
+      "best_stage": "Vector + BM25 (RRF)",
+      "category": "image_to_text",
+      "configured_content_type_filter": "text",
+      "failure_examples": [
+        {
+          "image_query_path": "images/ocean_beach.png",
+          "query": "",
+          "query_type": "image",
+          "relevant_paths": [
+            "text/nature_oceans.md"
+          ],
+          "top_results": [
+            "text/sports_golf.md",
+            "text/music_production.md",
+            "text/nature_forests.md"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": "images/whiteboard_architecture.png",
+          "query": "",
+          "query_type": "image",
+          "relevant_paths": [
+            "text/ai_agents.md",
+            "text/tech_edge_ai.md"
+          ],
+          "top_results": [
+            "videos/architecture_walkthrough.mp4::transcript:0001@0.00s",
+            "text/tech_cloud_computing.md",
+            "videos/whiteboard_session.mp4::transcript:0001@0.00s"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": "images/whiteboard_brainstorm.png",
+          "query": "",
+          "query_type": "image",
+          "relevant_paths": [
+            "text/ai_agents.md",
+            "text/tech_cloud_computing.md"
+          ],
+          "top_results": [
+            "videos/whiteboard_session.mp4::transcript:0002@2.00s",
+            "videos/whiteboard_session.mp4::transcript:0001@0.00s",
+            "videos/architecture_walkthrough.mp4::transcript:0001@0.00s"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "15 queries is below the 20-query diagnostic floor.",
+          "severity": "medium"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process image queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.467; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_lift",
+          "evidence": "RRF improves R@5 by +0.133 and R@10 by +0.133 over raw vector search.",
+          "severity": "positive"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 0.8667,
+        "hybrid_recall_at_5": 0.6,
+        "rrf_recall_at_10": 0.8667,
+        "rrf_recall_at_5": 0.6,
+        "vector_recall_at_10": 0.7333,
+        "vector_recall_at_5": 0.4667
+      },
+      "priority_score": 0.2,
+      "target_content_type": "text",
+      "total_queries": 15
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 2,
+        "original_vec": 8
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "image_to_video",
+      "configured_content_type_filter": "video",
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "2 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process image queries",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.2,
+      "target_content_type": "video",
+      "total_queries": 2
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 44,
+        "original_vec": 48
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "text_to_document",
+      "configured_content_type_filter": null,
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "12 queries is below the 20-query diagnostic floor.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "document_family_filter_gap",
+          "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": 1.0,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.2,
+      "target_content_type": "document_family",
+      "total_queries": 12
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_vec": 4
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "video_to_video",
+      "configured_content_type_filter": "video",
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "1 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process video queries",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "media_query_path_missing",
+          "evidence": "1 per-query rows omit image_query_path or video_query_path instrumentation.",
+          "severity": "low"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.2,
+      "target_content_type": "video",
+      "total_queries": 1
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 17,
+        "original_vec": 72
+      },
+      "best_recall_at_5": 0.9444,
+      "best_stage": "Vector + BM25 (RRF)",
+      "category": "text_to_image",
+      "configured_content_type_filter": "image",
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "coastal landscape photography",
+          "query_type": "text",
+          "relevant_paths": [
+            "images/ocean_beach.png"
+          ],
+          "top_results": [
+            "videos/nature_timelapse.mp4::frame:0006@25.00s",
+            "videos/nature_timelapse.mp4::frame:0005@20.00s",
+            "videos/nature_timelapse.mp4::frame:0003@10.00s"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "18 queries is below the 20-query diagnostic floor.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": 0.7222,
+        "hybrid_recall_at_10": 0.9444,
+        "hybrid_recall_at_5": 0.9444,
+        "rrf_recall_at_10": 0.9444,
+        "rrf_recall_at_5": 0.9444,
+        "vector_recall_at_10": 0.9444,
+        "vector_recall_at_5": 0.8889
+      },
+      "priority_score": 0.15,
+      "target_content_type": "image",
+      "total_queries": 18
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 1,
+        "original_vec": 60
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "text_to_video",
+      "configured_content_type_filter": "video",
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "15 queries is below the 20-query diagnostic floor.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": 0.1333,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.15,
+      "target_content_type": "video",
+      "total_queries": 15
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 75,
+        "original_vec": 78
+      },
+      "best_recall_at_5": 0.95,
+      "best_stage": "BM25-only",
+      "category": "mixed_modal",
+      "configured_content_type_filter": null,
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "comprehensive guide to athletic performance",
+          "query_type": "text",
+          "relevant_paths": [
+            "text/sports_running.md",
+            "text/sports_cycling.md",
+            "text/sports_swimming.md",
+            "text/sports_yoga.md",
+            "text/medicine_nutrition.md",
+            "text/medicine_cardiology.md"
+          ],
+          "top_results": [
+            "text/cooking_sourdough.md",
+            "documents/ai_architecture_deck.pptx::slide:0003",
+            "documents/edge_deployment_guide.pdf::page:0001"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": 0.95,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 0.9,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 0.95,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 0.85
+      },
+      "priority_score": 0.0,
+      "target_content_type": null,
+      "total_queries": 20
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 188,
+        "original_vec": 241
+      },
+      "best_recall_at_5": 0.9,
+      "best_stage": "Vector + BM25 (RRF)",
+      "category": "text_to_text",
+      "configured_content_type_filter": "text",
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "how do computers understand the meaning of words",
+          "query_type": "text",
+          "relevant_paths": [
+            "text/ai_embeddings.md"
+          ],
+          "top_results": [
+            "text/ai_transformers.md",
+            "text/tech_quantum_computing.md",
+            "text/tech_cloud_computing.md"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": null,
+          "query": "underwater basket weaving techniques",
+          "query_type": "text",
+          "relevant_paths": [],
+          "top_results": [
+            "text/sports_swimming.md",
+            "text/architecture_gothic.md",
+            "text/cooking_sourdough.md"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": null,
+          "query": "medieval jousting tournament rules and equipment",
+          "query_type": "text",
+          "relevant_paths": [],
+          "top_results": [
+            "text/sports_swimming.md",
+            "text/sports_cycling.md",
+            "text/history_renaissance.md"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": 0.8667,
+        "hybrid_recall_at_10": 0.9167,
+        "hybrid_recall_at_5": 0.9,
+        "rrf_recall_at_10": 0.9167,
+        "rrf_recall_at_5": 0.9,
+        "vector_recall_at_10": 0.9,
+        "vector_recall_at_5": 0.8833
+      },
+      "priority_score": 0.0,
+      "target_content_type": "text",
+      "total_queries": 60
+    }
+  ],
+  "diagnosed_at": "2026-05-17T17:57:55.151267+00:00",
+  "diagnostic": "cross_modal_retrieval",
+  "method_notes": [
+    "Vector-only is treated as the raw embedding baseline.",
+    "RRF lift over vector-only is treated as evidence from derived text probes such as captions, transcripts, OCR, or BM25 text.",
+    "Hybrid-minus-RRF isolates the current reranker contribution.",
+    "Parent-memory versus asset-level scoring is only available when the source payload includes asset_level metrics."
+  ],
+  "prioritized_actions": [
+    {
+      "owner": "search",
+      "priority": 1,
+      "rationale": "The weakest media-query categories are not rescued by current RRF/reranker stages, so REC-130 should target a strict top-K cascade instead of broad expensive scoring.",
+      "title": "Add bounded cascaded media reranking only after cheap top-K retrieval."
+    },
+    {
+      "owner": "evals",
+      "priority": 2,
+      "rationale": "Several weak categories have 1-3 examples, which is too small to distinguish model weakness from benchmark noise; this maps directly to REC-160.",
+      "title": "Expand weak categories to at least 20 queries and keep parent-memory scoring."
+    },
+    {
+      "owner": "indexing",
+      "priority": 3,
+      "rationale": "Document retrieval categories are evaluated without a proper document-family content filter, so unrelated images/videos can dominate media-query results.",
+      "title": "Represent document-family filters explicitly across pdf/docx/pptx roots and children."
+    },
+    {
+      "owner": "evals",
+      "priority": 4,
+      "rationale": "Queries such as 'related document' are useful smoke probes but too ambiguous for release-quality diagnostics.",
+      "title": "Replace placeholder media prompts with grounded intent labels and provenance."
+    },
+    {
+      "owner": "evals",
+      "priority": 5,
+      "rationale": "The checked-in result is from v0.2.0 and predates serialized asset-level rollups, so it cannot fully separate child-asset hits from parent-memory hits.",
+      "title": "Rerun cross-modal ablation with the current harness to populate asset_level metrics."
+    },
+    {
+      "owner": "ingest",
+      "priority": 6,
+      "rationale": "Where RRF improves over vector-only, the improvement is evidence that derived text is helping and should be cached/versioned rather than recomputed ad hoc.",
+      "title": "Keep strengthening captions, transcripts, and OCR as first-class retrieval text."
+    },
+    {
+      "owner": "model_research",
+      "priority": 7,
+      "rationale": "ViDoRe-style visual document retrieval and MTEB/BEIR-style qrels offer better external baselines for document-heavy failures than anecdotes from one synthetic corpus.",
+      "title": "Benchmark visual/document-specialized retrievers against the weak categories."
+    }
+  ],
+  "source": {
+    "benchmark": "cross_modal_ablation",
+    "generated_at": "2026-03-22T01:15:51.127774+00:00",
+    "run_status": "complete",
+    "version": "0.2.0"
+  },
+  "summary": {
+    "categories": 13,
+    "issue_counts": {
+      "bm25_modality_blind": 8,
+      "derived_text_probe_insufficient": 3,
+      "derived_text_probe_lift": 2,
+      "document_family_filter_gap": 3,
+      "embedding_alignment_gap": 5,
+      "generic_query_artifact": 6,
+      "media_query_path_missing": 4,
+      "parent_asset_metrics_missing": 13,
+      "reranker_no_lift": 4,
+      "under_sampled_category": 11
+    },
+    "weak_categories": 9
+  },
+  "thresholds": {
+    "meaningful_delta": 0.1,
+    "min_category_queries": 20,
+    "weak_recall_at_5": 0.6
+  },
+  "weak_categories": [
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 8,
+        "original_vec": 8
+      },
+      "best_recall_at_5": 0.0,
+      "best_stage": "Vector-only",
+      "category": "image_to_document",
+      "configured_content_type_filter": null,
+      "failure_examples": [
+        {
+          "image_query_path": "images/neural_network_diagram.png",
+          "query": "related document",
+          "query_type": "image",
+          "relevant_paths": [
+            "documents/ai_strategy_report.docx",
+            "documents/ai_architecture_deck.pptx",
+            "documents/embedding_research.pdf"
+          ],
+          "top_results": [
+            "images/neural_network_diagram.png",
+            "images/whiteboard_brainstorm.png",
+            "images/whiteboard_architecture.png"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": "images/code_editor_screenshot.png",
+          "query": "related document",
+          "query_type": "image",
+          "relevant_paths": [
+            "documents/recallforge_spec.docx",
+            "documents/operations_manual.pdf"
+          ],
+          "top_results": [
+            "images/code_editor_screenshot.png",
+            "videos/coding_demo.mp4::transcript:0001@0.00s",
+            "images/whiteboard_architecture.png"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "2 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process image queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_lift",
+          "evidence": "RRF improves R@5 by +0.000 and R@10 by +0.500 over raw vector search.",
+          "severity": "positive"
+        },
+        {
+          "code": "reranker_no_lift",
+          "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "document_family_filter_gap",
+          "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 0.5,
+        "hybrid_recall_at_5": 0.0,
+        "rrf_recall_at_10": 0.5,
+        "rrf_recall_at_5": 0.0,
+        "vector_recall_at_10": 0.0,
+        "vector_recall_at_5": 0.0
+      },
+      "priority_score": 0.85,
+      "target_content_type": "document_family",
+      "total_queries": 2
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 4,
+        "original_vec": 4
+      },
+      "best_recall_at_5": 0.0,
+      "best_stage": "Vector-only",
+      "category": "video_to_document",
+      "configured_content_type_filter": null,
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "related document",
+          "query_type": "video",
+          "relevant_paths": [
+            "documents/ai_architecture_deck.pptx"
+          ],
+          "top_results": [
+            "videos/architecture_walkthrough.mp4",
+            "videos/architecture_walkthrough.mp4::frame:0004@15.00s",
+            "videos/architecture_walkthrough.mp4::frame:0006@25.00s"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "1 query is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process video queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_insufficient",
+          "evidence": "RRF R@5=0.000 does not materially lift vector R@5=0.000.",
+          "severity": "high"
+        },
+        {
+          "code": "reranker_no_lift",
+          "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "document_family_filter_gap",
+          "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        },
+        {
+          "code": "media_query_path_missing",
+          "evidence": "1 per-query row omits image_query_path or video_query_path instrumentation.",
+          "severity": "low"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 0.0,
+        "hybrid_recall_at_5": 0.0,
+        "rrf_recall_at_10": 0.0,
+        "rrf_recall_at_5": 0.0,
+        "vector_recall_at_10": 0.0,
+        "vector_recall_at_5": 0.0
+      },
+      "priority_score": 0.85,
+      "target_content_type": "document_family",
+      "total_queries": 1
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 7,
+        "original_vec": 8
+      },
+      "best_recall_at_5": 0.0,
+      "best_stage": "Vector-only",
+      "category": "video_to_image",
+      "configured_content_type_filter": "image",
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "related image",
+          "query_type": "video",
+          "relevant_paths": [
+            "images/forest_landscape.png",
+            "images/mountain_landscape.png",
+            "images/ocean_beach.png"
+          ],
+          "top_results": [
+            "videos/nature_timelapse.mp4::frame:0006@25.00s",
+            "videos/nature_timelapse.mp4::frame:0005@20.00s",
+            "videos/nature_timelapse.mp4::frame:0007@30.00s"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": null,
+          "query": "related image",
+          "query_type": "video",
+          "relevant_paths": [
+            "images/whiteboard_brainstorm.png",
+            "images/whiteboard_architecture.png"
+          ],
+          "top_results": [
+            "videos/whiteboard_session.mp4::frame:0003@10.00s",
+            "videos/whiteboard_session.mp4::frame:0002@5.00s",
+            "videos/whiteboard_session.mp4::frame:0001@0.00s"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "2 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process video queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_insufficient",
+          "evidence": "RRF R@5=0.000 does not materially lift vector R@5=0.000.",
+          "severity": "high"
+        },
+        {
+          "code": "reranker_no_lift",
+          "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        },
+        {
+          "code": "media_query_path_missing",
+          "evidence": "2 per-query rows omit image_query_path or video_query_path instrumentation.",
+          "severity": "low"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 0.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 0.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 0.0
+      },
+      "priority_score": 0.8,
+      "target_content_type": "image",
+      "total_queries": 2
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 8,
+        "original_vec": 12
+      },
+      "best_recall_at_5": 0.3333,
+      "best_stage": "Vector-only",
+      "category": "video_to_text",
+      "configured_content_type_filter": "text",
+      "failure_examples": [
+        {
+          "image_query_path": null,
+          "query": "related text",
+          "query_type": "video",
+          "relevant_paths": [
+            "text/tech_cybersecurity.md",
+            "text/tech_cloud_computing.md"
+          ],
+          "top_results": [
+            "videos/coding_demo.mp4::transcript:0002@2.00s",
+            "videos/whiteboard_session.mp4::transcript:0001@0.00s",
+            "text/medicine_nutrition.md"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": null,
+          "query": "related text",
+          "query_type": "video",
+          "relevant_paths": [
+            "text/architecture_gothic.md",
+            "text/architecture_modern.md",
+            "text/architecture_blueprints.md"
+          ],
+          "top_results": [
+            "videos/coding_demo.mp4::transcript:0002@2.00s",
+            "text/ai_agents.md",
+            "documents/ai_architecture_deck.pptx::slide:0001"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "3 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process video queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.333; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_insufficient",
+          "evidence": "RRF R@5=0.333 does not materially lift vector R@5=0.333.",
+          "severity": "high"
+        },
+        {
+          "code": "reranker_no_lift",
+          "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        },
+        {
+          "code": "media_query_path_missing",
+          "evidence": "3 per-query rows omit image_query_path or video_query_path instrumentation.",
+          "severity": "low"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 0.3333,
+        "hybrid_recall_at_5": 0.3333,
+        "rrf_recall_at_10": 0.3333,
+        "rrf_recall_at_5": 0.3333,
+        "vector_recall_at_10": 0.3333,
+        "vector_recall_at_5": 0.3333
+      },
+      "priority_score": 0.4667,
+      "target_content_type": "text",
+      "total_queries": 3
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 8,
+        "original_vec": 12
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "image_to_image",
+      "configured_content_type_filter": "image",
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "3 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process image queries",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.2,
+      "target_content_type": "image",
+      "total_queries": 3
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 60,
+        "original_vec": 63
+      },
+      "best_recall_at_5": 0.6,
+      "best_stage": "Vector + BM25 (RRF)",
+      "category": "image_to_text",
+      "configured_content_type_filter": "text",
+      "failure_examples": [
+        {
+          "image_query_path": "images/ocean_beach.png",
+          "query": "",
+          "query_type": "image",
+          "relevant_paths": [
+            "text/nature_oceans.md"
+          ],
+          "top_results": [
+            "text/sports_golf.md",
+            "text/music_production.md",
+            "text/nature_forests.md"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": "images/whiteboard_architecture.png",
+          "query": "",
+          "query_type": "image",
+          "relevant_paths": [
+            "text/ai_agents.md",
+            "text/tech_edge_ai.md"
+          ],
+          "top_results": [
+            "videos/architecture_walkthrough.mp4::transcript:0001@0.00s",
+            "text/tech_cloud_computing.md",
+            "videos/whiteboard_session.mp4::transcript:0001@0.00s"
+          ],
+          "video_query_path": null
+        },
+        {
+          "image_query_path": "images/whiteboard_brainstorm.png",
+          "query": "",
+          "query_type": "image",
+          "relevant_paths": [
+            "text/ai_agents.md",
+            "text/tech_cloud_computing.md"
+          ],
+          "top_results": [
+            "videos/whiteboard_session.mp4::transcript:0002@2.00s",
+            "videos/whiteboard_session.mp4::transcript:0001@0.00s",
+            "videos/architecture_walkthrough.mp4::transcript:0001@0.00s"
+          ],
+          "video_query_path": null
+        }
+      ],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "15 queries is below the 20-query diagnostic floor.",
+          "severity": "medium"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process image queries",
+          "severity": "medium"
+        },
+        {
+          "code": "embedding_alignment_gap",
+          "evidence": "Vector-only R@5=0.467; media-query categories need raw embedding alignment above 0.60.",
+          "severity": "high"
+        },
+        {
+          "code": "derived_text_probe_lift",
+          "evidence": "RRF improves R@5 by +0.133 and R@10 by +0.133 over raw vector search.",
+          "severity": "positive"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 0.8667,
+        "hybrid_recall_at_5": 0.6,
+        "rrf_recall_at_10": 0.8667,
+        "rrf_recall_at_5": 0.6,
+        "vector_recall_at_10": 0.7333,
+        "vector_recall_at_5": 0.4667
+      },
+      "priority_score": 0.2,
+      "target_content_type": "text",
+      "total_queries": 15
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 2,
+        "original_vec": 8
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "image_to_video",
+      "configured_content_type_filter": "video",
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "2 queries is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process image queries",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "generic_query_artifact",
+          "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.2,
+      "target_content_type": "video",
+      "total_queries": 2
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_fts": 44,
+        "original_vec": 48
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "text_to_document",
+      "configured_content_type_filter": null,
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "12 queries is below the 20-query diagnostic floor.",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "document_family_filter_gap",
+          "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.",
+          "severity": "medium"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": 1.0,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.2,
+      "target_content_type": "document_family",
+      "total_queries": 12
+    },
+    {
+      "audit_source_counts_top5": {
+        "original_vec": 4
+      },
+      "best_recall_at_5": 1.0,
+      "best_stage": "Vector-only",
+      "category": "video_to_video",
+      "configured_content_type_filter": "video",
+      "failure_examples": [],
+      "issues": [
+        {
+          "code": "under_sampled_category",
+          "evidence": "1 query is below the 20-query diagnostic floor.",
+          "severity": "high"
+        },
+        {
+          "code": "bm25_modality_blind",
+          "evidence": "BM25 can't process video queries",
+          "severity": "medium"
+        },
+        {
+          "code": "parent_asset_metrics_missing",
+          "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.",
+          "severity": "medium"
+        },
+        {
+          "code": "media_query_path_missing",
+          "evidence": "1 per-query row omits image_query_path or video_query_path instrumentation.",
+          "severity": "low"
+        }
+      ],
+      "metrics": {
+        "bm25_recall_at_5": null,
+        "hybrid_recall_at_10": 1.0,
+        "hybrid_recall_at_5": 1.0,
+        "rrf_recall_at_10": 1.0,
+        "rrf_recall_at_5": 1.0,
+        "vector_recall_at_10": 1.0,
+        "vector_recall_at_5": 1.0
+      },
+      "priority_score": 0.2,
+      "target_content_type": "video",
+      "total_queries": 1
+    }
+  ]
+}
diff --git a/docs/RELEASE.md b/docs/RELEASE.md
index 8854dc6..9c9a87d 100644
--- a/docs/RELEASE.md
+++ b/docs/RELEASE.md
@@ -45,6 +45,14 @@ Then run the expanded benchmark:
 
 The benchmark now checkpoints to JSON as it runs. If the run is interrupted, the output file still contains partial results plus progress metadata.
 
+After a complete or partial benchmark run, generate the cross-modal diagnostics report:
+
+```bash
+.venv/bin/python benchmarks/cross_modal_diagnostics.py --input benchmarks/results/cross_modal_ablation_results.json
+```
+
+This writes `benchmarks/results/cross_modal_diagnostics.json` and `docs/research/cross-modal-diagnostics.md`, ranking weak or under-sampled categories and separating raw embedding, derived-text, reranker, parent-memory, and benchmark-artifact signals.
+
 For safer local validation after the MLX hardening work, prefer the bounded smoke lane first:
 
 ```bash
diff --git a/docs/research/cross-modal-diagnostics.md b/docs/research/cross-modal-diagnostics.md
new file mode 100644
index 0000000..c5149a2
--- /dev/null
+++ b/docs/research/cross-modal-diagnostics.md
@@ -0,0 +1,289 @@
+# Cross-Modal Retrieval Diagnostics
+
+This report is generated from the saved cross-modal ablation JSON. It separates raw embedding alignment, derived-text contribution, reranker contribution, benchmark artifacts, and parent-memory scoring coverage.
+
+## Source
+
+- Benchmark: `cross_modal_ablation`
+- Source version: `0.2.0`
+- Source generated at: `2026-03-22T01:15:51.127774+00:00`
+- Run status: `complete`
+- Weak threshold: R@5 < 60.0%
+- Query floor: 20 queries per category
+
+## Weak And At-Risk Category Ranking
+
+| Priority | Category | Queries | Best stage | Best R@5 | Vector R@5 | RRF R@5 | Hybrid R@5 | Key issues |
+|---:|---|---:|---|---:|---:|---:|---:|---|
+| 0.85 | `image_to_document` | 2 | Vector-only | 0.0% | 0.0% | 0.0% | 0.0% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_lift |
+| 0.85 | `video_to_document` | 1 | Vector-only | 0.0% | 0.0% | 0.0% | 0.0% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_insufficient |
+| 0.80 | `video_to_image` | 2 | Vector-only | 0.0% | 0.0% | 0.0% | 0.0% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_insufficient |
+| 0.47 | `video_to_text` | 3 | Vector-only | 33.3% | 33.3% | 33.3% | 33.3% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_insufficient |
+| 0.20 | `image_to_image` | 3 | Vector-only | 100.0% | 100.0% | 100.0% | 100.0% | under_sampled_category, bm25_modality_blind, parent_asset_metrics_missing |
+| 0.20 | `image_to_text` | 15 | Vector + BM25 (RRF) | 60.0% | 46.7% | 60.0% | 60.0% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_lift |
+| 0.20 | `image_to_video` | 2 | Vector-only | 100.0% | 100.0% | 100.0% | 100.0% | under_sampled_category, bm25_modality_blind, parent_asset_metrics_missing, generic_query_artifact |
+| 0.20 | `text_to_document` | 12 | Vector-only | 100.0% | 100.0% | 100.0% | 100.0% | under_sampled_category, parent_asset_metrics_missing, document_family_filter_gap |
+| 0.20 | `video_to_video` | 1 | Vector-only | 100.0% | 100.0% | 100.0% | 100.0% | under_sampled_category, bm25_modality_blind, parent_asset_metrics_missing, media_query_path_missing |
+
+## Diagnosis Summary
+
+- `bm25_modality_blind`: 8
+- `derived_text_probe_insufficient`: 3
+- `derived_text_probe_lift`: 2
+- `document_family_filter_gap`: 3
+- `embedding_alignment_gap`: 5
+- `generic_query_artifact`: 6
+- `media_query_path_missing`: 4
+- `parent_asset_metrics_missing`: 13
+- `reranker_no_lift`: 4
+- `under_sampled_category`: 11
+
+## Prioritized Fix List
+
+1. **search** - Add bounded cascaded media reranking only after cheap top-K retrieval. The weakest media-query categories are not rescued by current RRF/reranker stages, so REC-130 should target a strict top-K cascade instead of broad expensive scoring.
+2. **evals** - Expand weak categories to at least 20 queries and keep parent-memory scoring. Several weak categories have only 1-3 examples, which is too few to distinguish model weakness from benchmark noise; this maps directly to REC-160.
+3. **indexing** - Represent document-family filters explicitly across pdf/docx/pptx roots and children. Document retrieval categories are evaluated without a proper document-family content filter, so unrelated images/videos can dominate media-query results.
+4. **evals** - Replace placeholder media prompts with grounded intent labels and provenance. Queries such as 'related document' are useful smoke probes but too ambiguous for release-quality diagnostics.
+5. **evals** - Rerun cross-modal ablation with the current harness to populate asset_level metrics. The checked-in result is from v0.2.0 and predates serialized asset-level rollups, so it cannot fully separate child-asset hits from parent-memory hits.
+6. **ingest** - Keep strengthening captions, transcripts, and OCR as first-class retrieval text. Where RRF improves over vector-only, the improvement is evidence that derived text is helping and should be cached/versioned rather than recomputed ad hoc.
+7. **model_research** - Benchmark visual/document-specialized retrievers against the weak categories. ViDoRe-style visual document retrieval and MTEB/BEIR-style qrels offer better external baselines for document-heavy failures than anecdotes from one synthetic corpus.
+
+## Category Evidence
+
+### `image_to_document`
+
+- Queries: 2
+- Target result family: `document_family`
+- Configured benchmark content filter: `None`
+- Best stage/R@5: Vector-only / 0.0%
+- Raw vector R@5/R@10: 0.0% / 0.0%
+- RRF R@5/R@10: 0.0% / 50.0%
+- Hybrid R@5/R@10: 0.0% / 50.0%
+- Top-5 audit source counts: `original_fts`=8, `original_vec`=8
+- `under_sampled_category` (high): 2 queries is below the 20-query diagnostic floor.
+- `bm25_modality_blind` (medium): BM25 can't process image queries
+- `embedding_alignment_gap` (high): Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.
+- `derived_text_probe_lift` (positive): RRF leaves R@5 unchanged (+0.000) but improves R@10 by +0.500 over raw vector search.
+- `reranker_no_lift` (medium): Hybrid reranker changes R@5 by only +0.000 versus RRF.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- `document_family_filter_gap` (medium): Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.
+- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.
+- Example misses:
+  - `related document` (image): expected ['documents/ai_strategy_report.docx', 'documents/ai_architecture_deck.pptx', 'documents/embedding_research.pdf']; top results `images/neural_network_diagram.png`, `images/whiteboard_brainstorm.png`, `images/whiteboard_architecture.png`
+  - `related document` (image): expected ['documents/recallforge_spec.docx', 'documents/operations_manual.pdf']; top results `images/code_editor_screenshot.png`, `videos/coding_demo.mp4::transcript:0001@0.00s`, `images/whiteboard_architecture.png`
+
+### `video_to_document`
+
+- Queries: 1
+- Target result family: `document_family`
+- Configured benchmark content filter: `None`
+- Best stage/R@5: Vector-only / 0.0%
+- Raw vector R@5/R@10: 0.0% / 0.0%
+- RRF R@5/R@10: 0.0% / 0.0%
+- Hybrid R@5/R@10: 0.0% / 0.0%
+- Top-5 audit source counts: `original_fts`=4, `original_vec`=4
+- `under_sampled_category` (high): 1 query is below the 20-query diagnostic floor.
+- `bm25_modality_blind` (medium): BM25 can't process video queries
+- `embedding_alignment_gap` (high): Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.
+- `derived_text_probe_insufficient` (high): RRF R@5=0.000 does not materially lift vector R@5=0.000.
+- `reranker_no_lift` (medium): Hybrid reranker changes R@5 by only +0.000 versus RRF.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- `document_family_filter_gap` (medium): Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.
+- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.
+- `media_query_path_missing` (low): 1 per-query row omits image_query_path or video_query_path instrumentation.
+- Example misses:
+  - `related document` (video): expected ['documents/ai_architecture_deck.pptx']; top results `videos/architecture_walkthrough.mp4`, `videos/architecture_walkthrough.mp4::frame:0004@15.00s`, `videos/architecture_walkthrough.mp4::frame:0006@25.00s`
+
+### `video_to_image`
+
+- Queries: 2
+- Target result family: `image`
+- Configured benchmark content filter: `image`
+- Best stage/R@5: Vector-only / 0.0%
+- Raw vector R@5/R@10: 0.0% / 100.0%
+- RRF R@5/R@10: 0.0% / 100.0%
+- Hybrid R@5/R@10: 0.0% / 100.0%
+- Top-5 audit source counts: `original_fts`=7, `original_vec`=8
+- `under_sampled_category` (high): 2 queries is below the 20-query diagnostic floor.
+- `bm25_modality_blind` (medium): BM25 can't process video queries
+- `embedding_alignment_gap` (high): Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.
+- `derived_text_probe_insufficient` (high): RRF R@5=0.000 does not materially lift vector R@5=0.000.
+- `reranker_no_lift` (medium): Hybrid reranker changes R@5 by only +0.000 versus RRF.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.
+- `media_query_path_missing` (low): 2 per-query rows omit image_query_path or video_query_path instrumentation.
+- Example misses:
+  - `related image` (video): expected ['images/forest_landscape.png', 'images/mountain_landscape.png', 'images/ocean_beach.png']; top results `videos/nature_timelapse.mp4::frame:0006@25.00s`, `videos/nature_timelapse.mp4::frame:0005@20.00s`, `videos/nature_timelapse.mp4::frame:0007@30.00s`
+  - `related image` (video): expected ['images/whiteboard_brainstorm.png', 'images/whiteboard_architecture.png']; top results `videos/whiteboard_session.mp4::frame:0003@10.00s`, `videos/whiteboard_session.mp4::frame:0002@5.00s`, `videos/whiteboard_session.mp4::frame:0001@0.00s`
+
+### `video_to_text`
+
+- Queries: 3
+- Target result family: `text`
+- Configured benchmark content filter: `text`
+- Best stage/R@5: Vector-only / 33.3%
+- Raw vector R@5/R@10: 33.3% / 33.3%
+- RRF R@5/R@10: 33.3% / 33.3%
+- Hybrid R@5/R@10: 33.3% / 33.3%
+- Top-5 audit source counts: `original_fts`=8, `original_vec`=12
+- `under_sampled_category` (high): 3 queries is below the 20-query diagnostic floor.
+- `bm25_modality_blind` (medium): BM25 can't process video queries
+- `embedding_alignment_gap` (high): Vector-only R@5=0.333; media-query categories need raw embedding alignment above 0.60.
+- `derived_text_probe_insufficient` (high): RRF R@5=0.333 does not materially lift vector R@5=0.333.
+- `reranker_no_lift` (medium): Hybrid reranker changes R@5 by only +0.000 versus RRF.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.
+- `media_query_path_missing` (low): 3 per-query rows omit image_query_path or video_query_path instrumentation.
+- Example misses:
+  - `related text` (video): expected ['text/tech_cybersecurity.md', 'text/tech_cloud_computing.md']; top results `videos/coding_demo.mp4::transcript:0002@2.00s`, `videos/whiteboard_session.mp4::transcript:0001@0.00s`, `text/medicine_nutrition.md`
+  - `related text` (video): expected ['text/architecture_gothic.md', 'text/architecture_modern.md', 'text/architecture_blueprints.md']; top results `videos/coding_demo.mp4::transcript:0002@2.00s`, `text/ai_agents.md`, `documents/ai_architecture_deck.pptx::slide:0001`
+
+### `image_to_image`
+
+- Queries: 3
+- Target result family: `image`
+- Configured benchmark content filter: `image`
+- Best stage/R@5: Vector-only / 100.0%
+- Raw vector R@5/R@10: 100.0% / 100.0%
+- RRF R@5/R@10: 100.0% / 100.0%
+- Hybrid R@5/R@10: 100.0% / 100.0%
+- Top-5 audit source counts: `original_fts`=8, `original_vec`=12
+- `under_sampled_category` (high): 3 queries is below the 20-query diagnostic floor.
+- `bm25_modality_blind` (medium): BM25 can't process image queries
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+
+### `image_to_text`
+
+- Queries: 15
+- Target result family: `text`
+- Configured benchmark content filter: `text`
+- Best stage/R@5: Vector + BM25 (RRF) / 60.0%
+- Raw vector R@5/R@10: 46.7% / 73.3%
+- RRF R@5/R@10: 60.0% / 86.7%
+- Hybrid R@5/R@10: 60.0% / 86.7%
+- Top-5 audit source counts: `original_fts`=60, `original_vec`=63
+- `under_sampled_category` (medium): 15 queries is below the 20-query diagnostic floor.
+- `bm25_modality_blind` (medium): BM25 can't process image queries
+- `embedding_alignment_gap` (high): Vector-only R@5=0.467; media-query categories need raw embedding alignment above 0.60.
+- `derived_text_probe_lift` (positive): RRF improves R@5 by +0.133 and R@10 by +0.133 over raw vector search.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.
+- Example misses:
+  - `<empty media query>` (image): expected ['text/nature_oceans.md']; top results `text/sports_golf.md`, `text/music_production.md`, `text/nature_forests.md`
+  - `<empty media query>` (image): expected ['text/ai_agents.md', 'text/tech_edge_ai.md']; top results `videos/architecture_walkthrough.mp4::transcript:0001@0.00s`, `text/tech_cloud_computing.md`, `videos/whiteboard_session.mp4::transcript:0001@0.00s`
+  - `<empty media query>` (image): expected ['text/ai_agents.md', 'text/tech_cloud_computing.md']; top results `videos/whiteboard_session.mp4::transcript:0002@2.00s`, `videos/whiteboard_session.mp4::transcript:0001@0.00s`, `videos/architecture_walkthrough.mp4::transcript:0001@0.00s`
+
+### `image_to_video`
+
+- Queries: 2
+- Target result family: `video`
+- Configured benchmark content filter: `video`
+- Best stage/R@5: Vector-only / 100.0%
+- Raw vector R@5/R@10: 100.0% / 100.0%
+- RRF R@5/R@10: 100.0% / 100.0%
+- Hybrid R@5/R@10: 100.0% / 100.0%
+- Top-5 audit source counts: `original_fts`=2, `original_vec`=8
+- `under_sampled_category` (high): 2 queries is below the 20-query diagnostic floor.
+- `bm25_modality_blind` (medium): BM25 can't process image queries
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.
+
+### `text_to_document`
+
+- Queries: 12
+- Target result family: `document_family`
+- Configured benchmark content filter: `None`
+- Best stage/R@5: Vector-only / 100.0%
+- Raw vector R@5/R@10: 100.0% / 100.0%
+- RRF R@5/R@10: 100.0% / 100.0%
+- Hybrid R@5/R@10: 100.0% / 100.0%
+- Top-5 audit source counts: `original_fts`=44, `original_vec`=48
+- `under_sampled_category` (medium): 12 queries is below the 20-query diagnostic floor.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- `document_family_filter_gap` (medium): Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.
+
+### `video_to_video`
+
+- Queries: 1
+- Target result family: `video`
+- Configured benchmark content filter: `video`
+- Best stage/R@5: Vector-only / 100.0%
+- Raw vector R@5/R@10: 100.0% / 100.0%
+- RRF R@5/R@10: 100.0% / 100.0%
+- Hybrid R@5/R@10: 100.0% / 100.0%
+- Top-5 audit source counts: `original_vec`=4
+- `under_sampled_category` (high): 1 query is below the 20-query diagnostic floor.
+- `bm25_modality_blind` (medium): BM25 can't process video queries
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- `media_query_path_missing` (low): 1 per-query row omits image_query_path or video_query_path instrumentation.
+
+### `text_to_image`
+
+- Queries: 18
+- Target result family: `image`
+- Configured benchmark content filter: `image`
+- Best stage/R@5: Vector + BM25 (RRF) / 94.4%
+- Raw vector R@5/R@10: 88.9% / 94.4%
+- RRF R@5/R@10: 94.4% / 94.4%
+- Hybrid R@5/R@10: 94.4% / 94.4%
+- Top-5 audit source counts: `original_fts`=17, `original_vec`=72
+- `under_sampled_category` (medium): 18 queries is below the 20-query diagnostic floor.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- Example misses:
+  - `coastal landscape photography` (text): expected ['images/ocean_beach.png']; top results `videos/nature_timelapse.mp4::frame:0006@25.00s`, `videos/nature_timelapse.mp4::frame:0005@20.00s`, `videos/nature_timelapse.mp4::frame:0003@10.00s`
+
+### `text_to_video`
+
+- Queries: 15
+- Target result family: `video`
+- Configured benchmark content filter: `video`
+- Best stage/R@5: Vector-only / 100.0%
+- Raw vector R@5/R@10: 100.0% / 100.0%
+- RRF R@5/R@10: 100.0% / 100.0%
+- Hybrid R@5/R@10: 100.0% / 100.0%
+- Top-5 audit source counts: `original_fts`=1, `original_vec`=60
+- `under_sampled_category` (medium): 15 queries is below the 20-query diagnostic floor.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+
+### `mixed_modal`
+
+- Queries: 20
+- Target result family: `mixed`
+- Configured benchmark content filter: `None`
+- Best stage/R@5: BM25-only / 95.0%
+- Raw vector R@5/R@10: 85.0% / 100.0%
+- RRF R@5/R@10: 95.0% / 100.0%
+- Hybrid R@5/R@10: 90.0% / 100.0%
+- Top-5 audit source counts: `original_fts`=75, `original_vec`=78
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- Example misses:
+  - `comprehensive guide to athletic performance` (text): expected ['text/sports_running.md', 'text/sports_cycling.md', 'text/sports_swimming.md', 'text/sports_yoga.md', 'text/medicine_nutrition.md', 'text/medicine_cardiology.md']; top results `text/cooking_sourdough.md`, `documents/ai_architecture_deck.pptx::slide:0003`, `documents/edge_deployment_guide.pdf::page:0001`
+
+### `text_to_text`
+
+- Queries: 60
+- Target result family: `text`
+- Configured benchmark content filter: `text`
+- Best stage/R@5: Vector + BM25 (RRF) / 90.0%
+- Raw vector R@5/R@10: 88.3% / 90.0%
+- RRF R@5/R@10: 90.0% / 91.7%
+- Hybrid R@5/R@10: 90.0% / 91.7%
+- Top-5 audit source counts: `original_fts`=188, `original_vec`=241
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.
+- Example misses:
+  - `how do computers understand the meaning of words` (text): expected ['text/ai_embeddings.md']; top results `text/ai_transformers.md`, `text/tech_quantum_computing.md`, `text/tech_cloud_computing.md`
+  - `underwater basket weaving techniques` (text): expected []; top results `text/sports_swimming.md`, `text/architecture_gothic.md`, `text/cooking_sourdough.md`
+  - `medieval jousting tournament rules and equipment` (text): expected []; top results `text/sports_swimming.md`, `text/sports_cycling.md`, `text/history_renaissance.md`
+
+## Method Notes
+
+- Vector-only is treated as the raw embedding baseline.
+- RRF lift over vector-only is treated as evidence from derived text probes such as captions, transcripts, OCR, or BM25 text.
+- Hybrid-minus-RRF isolates the current reranker contribution.
+- Parent-memory versus asset-level scoring is only available when the source payload includes asset_level metrics.
+
+## External Evaluation References
+
+- [BEIR](https://github.com/beir-cellar/beir) structures retrieval evaluation around corpus, queries, qrels, run results, and metrics such as NDCG, MAP, Recall, Precision, and MRR.
+- [MTEB](https://github.com/embeddings-benchmark/mteb) is the broader embedding and retrieval evaluation framework now used by ViDoRe for single-model retriever submissions.
+- [ViDoRe pipeline evaluation](https://github.com/illuin-tech/vidore-benchmark) explicitly covers multi-stage, hybrid, reranking, OCR, and custom preprocessing pipelines for visual document retrieval.
diff --git a/tests/test_cross_modal_benchmark_defs.py b/tests/test_cross_modal_benchmark_defs.py
index 6ad5116..a660360 100644
--- a/tests/test_cross_modal_benchmark_defs.py
+++ b/tests/test_cross_modal_benchmark_defs.py
@@ -287,6 +287,52 @@ def test_output_payload_tracks_partial_progress(self):
             payload["stages"]["Vector-only"]["text_to_text"]["per_query_results"][0]["asset_level"]["hit_at_1"]
         )
 
+    def test_output_payload_preserves_video_query_path(self):  # Regression: video_query_path on a per-query row must appear in the built payload.
+        module = _load_cross_modal_ablation()
+
+        gt = module.VIDEO_TO_TEXT[0]  # first video-to-text ground-truth entry serves as the fixture
+        stage_result = module.StageResult(  # single-query stage result with zero hits
+            stage="Vector-only",
+            category=gt.category,
+            total_queries=1,
+            hits_at_1=0,
+            hits_at_5=0,
+            hits_at_10=0,
+        )
+        stage_result.per_query_results.append(
+            {
+                "query": gt.query,
+                "query_type": gt.query_type,
+                "image_query_path": gt.image_query_path,
+                "video_query_path": gt.video_query_path,  # field under test
+                "relevant_paths": gt.relevant_paths,
+                "hit_at_1": False,
+                "hit_at_5": False,
+                "hit_at_10": False,
+            }
+        )
+
+        payload = module._build_output_payload(  # one category, one stage, reported as a completed run
+            {gt.category: [gt]},
+            {"Vector-only": {gt.category: stage_result}},
+            [("Vector-only", "embed")],
+            expansion_profile=module._resolve_expansion_profile("caption_only"),
+            smoke_profile="safe",
+            rss_limit_mb=None,
+            peak_rss_mb=None,
+            indexed_items=1,
+            run_status="complete",
+            interrupted=False,
+            completed_stages=["Vector-only"],
+            current_stage=None,
+            current_category=None,
+        )
+
+        self.assertEqual(  # the serialized row must carry the same video_query_path as the ground truth
+            payload["stages"]["Vector-only"][gt.category]["per_query_results"][0]["video_query_path"],
+            gt.video_query_path,
+        )
+
     def test_resolve_expansion_profile_variants(self):
         module = _load_cross_modal_ablation()
 
diff --git a/tests/test_cross_modal_diagnostics.py b/tests/test_cross_modal_diagnostics.py
new file mode 100644
index 0000000..e723593
--- /dev/null
+++ b/tests/test_cross_modal_diagnostics.py
@@ -0,0 +1,266 @@
+"""
+Regression tests for cross-modal benchmark diagnostics.
+"""
+
+import importlib.util
+import sys
+import unittest
+from pathlib import Path
+
+
+def _load_diagnostics_module():  # Load benchmarks/cross_modal_diagnostics.py by file path and return the executed module.
+    repo_root = Path(__file__).resolve().parent.parent  # this file lives in tests/, so two parents up is the repo root
+    module_path = repo_root / "benchmarks" / "cross_modal_diagnostics.py"
+    spec = importlib.util.spec_from_file_location("cross_modal_diagnostics", module_path)
+    module = importlib.util.module_from_spec(spec)
+    assert spec.loader is not None  # guard before calling exec_module on the loader
+    sys.modules[spec.name] = module  # register under the spec name before executing the module body
+    spec.loader.exec_module(module)
+    return module
+
+
+class TestCrossModalDiagnostics(unittest.TestCase):
+    def _synthetic_payload(self):  # Minimal ablation payload fixture: three categories across four stages.
+        return {
+            "benchmark": "cross_modal_ablation",
+            "version": "0.2.0",
+            "generated_at": "2026-05-17T00:00:00+00:00",
+            "run_status": "complete",
+            "categories": {
+                "image_to_document": {"queries": 2},
+                "image_to_text": {"queries": 15},
+                "mixed_modal": {"queries": 20},
+            },
+            "stages": {
+                "Vector-only": {
+                    "image_to_document": {  # carries no "asset_level" key in any stage of this fixture
+                        "skipped": False,
+                        "total_queries": 2,
+                        "recall_at_5": 0.0,
+                        "recall_at_10": 0.0,
+                        "per_query_results": [
+                            {
+                                "query": "related document",
+                                "query_type": "image",
+                                "image_query_path": "images/neural_network_diagram.png",
+                                "relevant_paths": ["documents/ai_strategy_report.docx"],
+                                "hit_at_5": False,
+                                "results": [
+                                    {
+                                        "filepath": "recallforge://benchmark//repo/tests/uat/corpus/images/neural_network_diagram.png"
+                                    }
+                                ],
+                            },
+                            {
+                                "query": "related document",
+                                "query_type": "image",
+                                "image_query_path": "images/floor_plan_blueprint.png",
+                                "relevant_paths": ["documents/operations_manual.pdf"],
+                                "hit_at_5": False,
+                                "results": [],
+                            },
+                        ],
+                    },
+                    "image_to_text": {  # includes "asset_level" in every non-skipped stage
+                        "skipped": False,
+                        "total_queries": 15,
+                        "recall_at_5": 0.4,
+                        "recall_at_10": 0.5,
+                        "asset_level": {"recall_at_5": 0.4},
+                        "per_query_results": [
+                            {
+                                "query": "",
+                                "query_type": "image",
+                                "image_query_path": "images/ocean_beach.png",
+                                "relevant_paths": ["text/nature_oceans.md"],
+                                "hit_at_5": False,
+                                "results": [],
+                            }
+                        ],
+                    },
+                    "mixed_modal": {
+                        "skipped": False,
+                        "total_queries": 20,
+                        "recall_at_5": 0.9,
+                        "recall_at_10": 1.0,
+                        "asset_level": {"recall_at_5": 0.9},
+                        "per_query_results": [],
+                    },
+                },
+                "BM25-only": {  # both image-query categories are marked skipped in this stage
+                    "image_to_document": {
+                        "skipped": True,
+                        "total_queries": 2,
+                        "skip_reason": "BM25 can't process image queries",
+                        "per_query_results": [],
+                    },
+                    "image_to_text": {
+                        "skipped": True,
+                        "total_queries": 15,
+                        "skip_reason": "BM25 can't process image queries",
+                        "per_query_results": [],
+                    },
+                    "mixed_modal": {
+                        "skipped": False,
+                        "total_queries": 20,
+                        "recall_at_5": 0.8,
+                        "recall_at_10": 0.9,
+                        "asset_level": {"recall_at_5": 0.8},
+                        "per_query_results": [],
+                    },
+                },
+                "Vector + BM25 (RRF)": {
+                    "image_to_document": {
+                        "skipped": False,
+                        "total_queries": 2,
+                        "recall_at_5": 0.0,
+                        "recall_at_10": 0.5,
+                        "per_query_results": [
+                            {
+                                "query": "related document",
+                                "query_type": "image",
+                                "image_query_path": "images/neural_network_diagram.png",
+                                "relevant_paths": ["documents/ai_strategy_report.docx"],
+                                "hit_at_5": False,
+                                "results": [
+                                    {
+                                        "filepath": "recallforge://benchmark//repo/tests/uat/corpus/text/science_neuroscience.md",
+                                        "audit": {"rrf_sources": {"original_vec": 3, "original_fts": 1}},
+                                    }
+                                ],
+                            }
+                        ],
+                    },
+                    "image_to_text": {
+                        "skipped": False,
+                        "total_queries": 15,
+                        "recall_at_5": 0.65,  # above the Vector-only 0.4 for this category
+                        "recall_at_10": 0.8,
+                        "asset_level": {"recall_at_5": 0.65},
+                        "per_query_results": [
+                            {
+                                "query": "",
+                                "query_type": "image",
+                                "image_query_path": "images/ocean_beach.png",
+                                "relevant_paths": ["text/nature_oceans.md"],
+                                "hit_at_5": True,
+                                "results": [
+                                    {
+                                        "filepath": "recallforge://benchmark//repo/tests/uat/corpus/text/nature_oceans.md",
+                                        "audit": {"rrf_sources": {"original_vec": 1, "original_fts": 2}},
+                                    }
+                                ],
+                            }
+                        ],
+                    },
+                    "mixed_modal": {
+                        "skipped": False,
+                        "total_queries": 20,
+                        "recall_at_5": 0.95,
+                        "recall_at_10": 1.0,
+                        "asset_level": {"recall_at_5": 0.95},
+                        "per_query_results": [],
+                    },
+                },
+                "Vector + BM25 + Reranker": {
+                    "image_to_document": {
+                        "skipped": False,
+                        "total_queries": 2,
+                        "recall_at_5": 0.0,
+                        "recall_at_10": 0.5,
+                        "per_query_results": [],
+                    },
+                    "image_to_text": {
+                        "skipped": False,
+                        "total_queries": 15,
+                        "recall_at_5": 0.65,
+                        "recall_at_10": 0.8,
+                        "asset_level": {"recall_at_5": 0.65},
+                        "per_query_results": [],
+                    },
+                    "mixed_modal": {
+                        "skipped": False,
+                        "total_queries": 20,
+                        "recall_at_5": 0.95,
+                        "recall_at_10": 1.0,
+                        "asset_level": {"recall_at_5": 0.95},
+                        "per_query_results": [],
+                    },
+                },
+            },
+        }
+
+    def test_diagnostics_rank_and_classify_weak_categories(self):  # Checks the top-ranked weak category and the issue codes attached to it.
+        module = _load_diagnostics_module()
+
+        diagnostics = module.build_diagnostics(
+            self._synthetic_payload(),
+            content_type_filters={
+                "image_to_document": None,
+                "image_to_text": "text",
+                "mixed_modal": None,
+            },
+            min_queries=20,
+            weak_recall_at_5=0.6,
+        )
+
+        self.assertEqual(diagnostics["weak_categories"][0]["category"], "image_to_document")  # expected to rank first
+        issues = {  # set of issue codes reported for the top-ranked category
+            issue["code"]
+            for issue in diagnostics["weak_categories"][0]["issues"]
+        }
+        self.assertIn("under_sampled_category", issues)
+        self.assertIn("bm25_modality_blind", issues)
+        self.assertIn("embedding_alignment_gap", issues)
+        self.assertIn("document_family_filter_gap", issues)
+        self.assertIn("generic_query_artifact", issues)
+        self.assertIn("parent_asset_metrics_missing", issues)
+        self.assertEqual(  # NOTE(review): fixture rrf_sources are {"original_vec": 3, "original_fts": 1}; the 1/1 expectation suggests per-result presence counting -- confirm against build_diagnostics
+            diagnostics["weak_categories"][0]["audit_source_counts_top5"],
+            {"original_fts": 1, "original_vec": 1},
+        )
+
+    def test_diagnostics_detect_derived_text_lift(self):
+        """Check the issue codes reported for the ``image_to_text`` category."""
+        diagnostics_module = _load_diagnostics_module()
+        filters = {"image_to_document": None, "image_to_text": "text", "mixed_modal": None}
+        report = diagnostics_module.build_diagnostics(
+            self._synthetic_payload(),
+            content_type_filters=filters,
+            min_queries=20,
+            weak_recall_at_5=0.6,
+        )
+
+        # next() has no default here: an absent category raises StopIteration.
+        candidates = report["all_categories"]
+        image_to_text = next(c for c in candidates if c["category"] == "image_to_text")
+        found_codes = set()
+        for issue in image_to_text["issues"]:
+            found_codes.add(issue["code"])
+
+        self.assertIn("derived_text_probe_lift", found_codes)
+        self.assertNotIn("parent_asset_metrics_missing", found_codes)
+
+    def test_markdown_report_contains_ranked_fixes(self):
+        """Check that the rendered Markdown contains the expected fragments."""
+        diagnostics_module = _load_diagnostics_module()
+        filters = {"image_to_document": None, "image_to_text": "text", "mixed_modal": None}
+        report = diagnostics_module.build_diagnostics(
+            self._synthetic_payload(),
+            content_type_filters=filters,
+            min_queries=20,
+            weak_recall_at_5=0.6,
+        )
+        rendered = diagnostics_module.render_markdown(report)
+        # Checked in a fixed order so the first missing fragment is the one reported.
+        for fragment in (
+            "# Cross-Modal Retrieval Diagnostics",
+            "`image_to_document`",
+            "Prioritized Fix List",
+            "BEIR",
+        ):
+            self.assertIn(fragment, rendered)
+
+
+if __name__ == "__main__":  # true only when this file is executed as a script
+    unittest.main()  # hand control to unittest's command-line test runner