diff --git a/README.md b/README.md index 259e660..6afbba9 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ The reranker delivers **+20.7% R@1 over RRF fusion** and pushes R@10 to 97.8%. E *Benchmark categories: text_only (30 queries), image_only (30 queries), long_query (12 queries), typo_query (20 queries). See `benchmarks/results/pipeline_ablation_modality_results.json` for full breakdown.* -For release validation, use `benchmarks/cross_modal_ablation.py`. It now checkpoints JSON output as it runs, so long MLX benchmark sessions still leave behind a partial artifact if interrupted. +For release validation, use `benchmarks/cross_modal_ablation.py`. It checkpoints JSON output as it runs, so long MLX benchmark sessions still leave behind a partial artifact if interrupted. To turn that artifact into a ranked fix list, run `benchmarks/cross_modal_diagnostics.py`; the current report is in [docs/research/cross-modal-diagnostics.md](docs/research/cross-modal-diagnostics.md). ### Latency & resource usage diff --git a/benchmarks/cross_modal_ablation.py b/benchmarks/cross_modal_ablation.py index 026f085..8298b5f 100644 --- a/benchmarks/cross_modal_ablation.py +++ b/benchmarks/cross_modal_ablation.py @@ -2395,6 +2395,7 @@ def save_checkpoint( "query": gt.query, "query_type": gt.query_type, "image_query_path": gt.image_query_path, + "video_query_path": gt.video_query_path, "relevant_paths": gt.relevant_paths, "difficulty": gt.difficulty, "is_negative_control": gt.is_negative_control, @@ -2424,6 +2425,7 @@ def save_checkpoint( "query": gt.query, "query_type": gt.query_type, "image_query_path": gt.image_query_path, + "video_query_path": gt.video_query_path, "relevant_paths": gt.relevant_paths, "difficulty": gt.difficulty, "is_negative_control": gt.is_negative_control, diff --git a/benchmarks/cross_modal_diagnostics.py b/benchmarks/cross_modal_diagnostics.py new file mode 100644 index 0000000..190a2d5 --- /dev/null +++ b/benchmarks/cross_modal_diagnostics.py @@ -0,0 
+1,841 @@ +#!/usr/bin/env python3 +"""Diagnose weak cross-modal benchmark categories. + +This offline tool reads a saved ``cross_modal_ablation.py`` JSON payload and +turns the raw per-stage metrics into a ranked diagnosis. It is intentionally +stdlib-only so it can run after a long benchmark session without loading model +stacks or touching the local RecallForge index. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter, defaultdict +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple + + +DEFAULT_INPUT = Path("benchmarks/results/cross_modal_ablation_results.json") +DEFAULT_JSON_OUTPUT = Path("benchmarks/results/cross_modal_diagnostics.json") +DEFAULT_MARKDOWN_OUTPUT = Path("docs/research/cross-modal-diagnostics.md") + +WEAK_RECALL_AT_5 = 0.60 +MIN_CATEGORY_QUERIES = 20 +MEANINGFUL_DELTA = 0.10 + +DOCUMENT_FILE_TYPES = {"pdf", "docx", "pptx"} +DOCUMENT_CATEGORIES = { + "text_to_document", + "image_to_document", + "video_to_document", +} +GENERIC_MEDIA_QUERIES = { + "", + "related text", + "related image", + "related video", + "related document", +} + + +def _safe_float(value: Any) -> Optional[float]: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _stage_role(stage_name: str) -> str: + normalized = stage_name.lower() + if "vector-only" in normalized or normalized == "vector": + return "vector" + if "bm25-only" in normalized or normalized == "bm25": + return "bm25" + if "reranker" in normalized or "hybrid" in normalized: + return "hybrid" + if "rrf" in normalized: + return "rrf" + return "other" + + +def _stage_lookup(stages: Mapping[str, Any]) -> Dict[str, str]: + roles: Dict[str, str] = {} + for stage_name in stages: + role = _stage_role(stage_name) + if role not in roles: + roles[role] = stage_name + return roles + + +def 
_category_names(payload: Mapping[str, Any]) -> List[str]: + categories = set(payload.get("categories", {}) or {}) + for stage_data in (payload.get("stages", {}) or {}).values(): + if isinstance(stage_data, Mapping): + categories.update(stage_data) + return sorted(categories) + + +def _category_query_count(payload: Mapping[str, Any], category: str) -> int: + metadata = (payload.get("categories", {}) or {}).get(category) + if isinstance(metadata, Mapping): + queries = metadata.get("queries") + if queries is not None: + return int(queries) + for stage_data in (payload.get("stages", {}) or {}).values(): + metrics = stage_data.get(category) if isinstance(stage_data, Mapping) else None + if isinstance(metrics, Mapping) and metrics.get("total_queries") is not None: + return int(metrics["total_queries"]) + return 0 + + +def _metric( + stages: Mapping[str, Mapping[str, Any]], + stage_name: Optional[str], + category: str, + metric_name: str, +) -> Optional[float]: + if not stage_name: + return None + metrics = stages.get(stage_name, {}).get(category) + if not isinstance(metrics, Mapping) or metrics.get("skipped"): + return None + return _safe_float(metrics.get(metric_name)) + + +def _asset_metric( + stages: Mapping[str, Mapping[str, Any]], + stage_name: Optional[str], + category: str, + metric_name: str, +) -> Optional[float]: + if not stage_name: + return None + metrics = stages.get(stage_name, {}).get(category) + if not isinstance(metrics, Mapping) or metrics.get("skipped"): + return None + asset_level = metrics.get("asset_level") + if not isinstance(asset_level, Mapping): + return None + return _safe_float(asset_level.get(metric_name)) + + +def _best_stage( + stages: Mapping[str, Mapping[str, Any]], + category: str, + metric_name: str = "recall_at_5", +) -> Tuple[Optional[str], Optional[float]]: + best_name: Optional[str] = None + best_value: Optional[float] = None + for stage_name, stage_data in stages.items(): + metrics = stage_data.get(category) + if not 
isinstance(metrics, Mapping) or metrics.get("skipped"): + continue + value = _safe_float(metrics.get(metric_name)) + if value is None: + continue + if best_value is None or value > best_value: + best_name = stage_name + best_value = value + return best_name, best_value + + +def _is_media_query_category(category: str, per_query_results: Sequence[Mapping[str, Any]]) -> bool: + if category.startswith(("image_to_", "video_to_")): + return True + return any(q.get("query_type") in {"image", "video"} for q in per_query_results) + + +def _first_stage_queries( + stages: Mapping[str, Mapping[str, Any]], + category: str, +) -> List[Mapping[str, Any]]: + for stage_data in stages.values(): + metrics = stage_data.get(category) + if isinstance(metrics, Mapping): + rows = metrics.get("per_query_results") + if isinstance(rows, list) and rows: + return [row for row in rows if isinstance(row, Mapping)] + return [] + + +def _stage_queries( + stages: Mapping[str, Mapping[str, Any]], + stage_name: Optional[str], + category: str, +) -> List[Mapping[str, Any]]: + if not stage_name: + return [] + metrics = stages.get(stage_name, {}).get(category) + if not isinstance(metrics, Mapping): + return [] + rows = metrics.get("per_query_results") + if not isinstance(rows, list): + return [] + return [row for row in rows if isinstance(row, Mapping)] + + +def _query_values(rows: Iterable[Mapping[str, Any]]) -> List[str]: + return [str(row.get("query") or "").strip().lower() for row in rows] + + +def _has_generic_media_queries(rows: Sequence[Mapping[str, Any]]) -> bool: + if not rows: + return False + query_values = _query_values(rows) + generic_count = sum(1 for query in query_values if query in GENERIC_MEDIA_QUERIES) + return generic_count > 0 and generic_count >= max(1, len(query_values) // 2) + + +def _missing_media_query_paths(rows: Sequence[Mapping[str, Any]]) -> List[str]: + missing: List[str] = [] + for row in rows: + query_type = row.get("query_type") + if query_type == "image" and not 
row.get("image_query_path"): + missing.append(str(row.get("query") or "image query")) + if query_type == "video" and not row.get("video_query_path"): + missing.append(str(row.get("query") or "video query")) + return missing + + +def _has_asset_level_metrics(stages: Mapping[str, Mapping[str, Any]], category: str) -> bool: + for stage_data in stages.values(): + metrics = stage_data.get(category) + if isinstance(metrics, Mapping) and isinstance(metrics.get("asset_level"), Mapping): + return True + return False + + +def _audit_source_counts(rows: Sequence[Mapping[str, Any]], top_k: int = 5) -> Dict[str, int]: + counts: Counter[str] = Counter() + for row in rows: + for result in (row.get("results") or [])[:top_k]: + if not isinstance(result, Mapping): + continue + audit = result.get("audit") + if not isinstance(audit, Mapping): + continue + sources = audit.get("rrf_sources") + if not isinstance(sources, Mapping): + continue + for source, rank in sources.items(): + try: + if int(rank) > 0: + counts[str(source)] += 1 + except (TypeError, ValueError): + continue + return dict(sorted(counts.items())) + + +def _normalize_result_path(filepath: str) -> str: + raw = str(filepath or "") + if not raw: + return "" + if raw.startswith("recallforge://"): + raw = raw.split("/", 3)[-1] + marker = "/tests/uat/corpus/" + if marker in raw: + raw = raw.split(marker, 1)[1] + return raw + + +def _failure_examples( + rows: Sequence[Mapping[str, Any]], + *, + limit: int = 3, +) -> List[Dict[str, Any]]: + examples: List[Dict[str, Any]] = [] + for row in rows: + if row.get("hit_at_5") is True: + continue + top_results = [] + for result in (row.get("results") or [])[:3]: + if isinstance(result, Mapping): + top_results.append(_normalize_result_path(str(result.get("filepath") or ""))) + examples.append( + { + "query": row.get("query") or "", + "query_type": row.get("query_type") or "text", + "image_query_path": row.get("image_query_path"), + "video_query_path": row.get("video_query_path"), + 
"relevant_paths": row.get("relevant_paths") or [], + "top_results": top_results, + } + ) + if len(examples) >= limit: + break + return examples + + +def _issue(code: str, severity: str, evidence: str) -> Dict[str, str]: + return {"code": code, "severity": severity, "evidence": evidence} + + +def _action(owner: str, priority: int, title: str, rationale: str) -> Dict[str, Any]: + return { + "owner": owner, + "priority": priority, + "title": title, + "rationale": rationale, + } + + +def _target_content_type(category: str) -> Optional[str]: + if category.endswith("_text"): + return "text" + if category.endswith("_image"): + return "image" + if category.endswith("_video"): + return "video" + if category.endswith("_document"): + return "document_family" + return None + + +def _document_filter_gap( + category: str, + content_type_filters: Mapping[str, Optional[str]], +) -> bool: + if category not in DOCUMENT_CATEGORIES: + return False + configured = content_type_filters.get(category) + return configured is None or configured == "document" + + +def _build_category_diagnosis( + payload: Mapping[str, Any], + category: str, + *, + content_type_filters: Mapping[str, Optional[str]], + min_queries: int, + weak_recall_at_5: float, + meaningful_delta: float, +) -> Dict[str, Any]: + stages = payload.get("stages", {}) or {} + role_to_stage = _stage_lookup(stages) + first_rows = _first_stage_queries(stages, category) + best_name, best_r5 = _best_stage(stages, category, "recall_at_5") + + vector_stage = role_to_stage.get("vector") + bm25_stage = role_to_stage.get("bm25") + rrf_stage = role_to_stage.get("rrf") + hybrid_stage = role_to_stage.get("hybrid") + + vector_r5 = _metric(stages, vector_stage, category, "recall_at_5") + vector_r10 = _metric(stages, vector_stage, category, "recall_at_10") + bm25_r5 = _metric(stages, bm25_stage, category, "recall_at_5") + rrf_r5 = _metric(stages, rrf_stage, category, "recall_at_5") + rrf_r10 = _metric(stages, rrf_stage, category, "recall_at_10") + 
hybrid_r5 = _metric(stages, hybrid_stage, category, "recall_at_5") + hybrid_r10 = _metric(stages, hybrid_stage, category, "recall_at_10") + best_rows = _stage_queries(stages, best_name, category) + + total_queries = _category_query_count(payload, category) + issues: List[Dict[str, str]] = [] + media_query = _is_media_query_category(category, first_rows) + + if total_queries < min_queries: + issues.append( + _issue( + "under_sampled_category", + "high" if total_queries < 5 else "medium", + f"{total_queries} queries is below the {min_queries}-query diagnostic floor.", + ) + ) + + bm25_metrics = stages.get(bm25_stage or "", {}).get(category) + if isinstance(bm25_metrics, Mapping) and bm25_metrics.get("skipped"): + issues.append( + _issue( + "bm25_modality_blind", + "medium", + str(bm25_metrics.get("skip_reason") or "BM25 stage skipped."), + ) + ) + + if vector_r5 is not None and vector_r5 < weak_recall_at_5 and media_query: + issues.append( + _issue( + "embedding_alignment_gap", + "high", + f"Vector-only R@5={vector_r5:.3f}; media-query categories need raw embedding alignment above {weak_recall_at_5:.2f}.", + ) + ) + + if vector_r5 is not None and rrf_r5 is not None: + lift = rrf_r5 - vector_r5 + r10_lift = (rrf_r10 or 0.0) - (vector_r10 or 0.0) + if lift >= meaningful_delta or r10_lift >= meaningful_delta: + issues.append( + _issue( + "derived_text_probe_lift", + "positive", + f"RRF improves R@5 by {lift:+.3f} and R@10 by {r10_lift:+.3f} over raw vector search.", + ) + ) + elif media_query and best_r5 is not None and best_r5 < weak_recall_at_5: + issues.append( + _issue( + "derived_text_probe_insufficient", + "high", + f"RRF R@5={rrf_r5:.3f} does not materially lift vector R@5={vector_r5:.3f}.", + ) + ) + + if rrf_r5 is not None and hybrid_r5 is not None: + rerank_delta = hybrid_r5 - rrf_r5 + if rerank_delta <= -meaningful_delta: + issues.append( + _issue( + "reranker_regression", + "high", + f"Hybrid reranker R@5={hybrid_r5:.3f} trails RRF R@5={rrf_r5:.3f}.", + ) + 
) + elif best_r5 is not None and best_r5 < weak_recall_at_5 and abs(rerank_delta) < meaningful_delta: + issues.append( + _issue( + "reranker_no_lift", + "medium", + f"Hybrid reranker changes R@5 by only {rerank_delta:+.3f} versus RRF.", + ) + ) + + if not _has_asset_level_metrics(stages, category): + issues.append( + _issue( + "parent_asset_metrics_missing", + "medium", + "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + ) + ) + elif best_name: + memory_r5 = _metric(stages, best_name, category, "recall_at_5") + asset_r5 = _asset_metric(stages, best_name, category, "recall_at_5") + if memory_r5 is not None and asset_r5 is not None and memory_r5 - asset_r5 >= meaningful_delta: + issues.append( + _issue( + "parent_rollup_matters", + "positive", + f"Parent-memory R@5={memory_r5:.3f} is {memory_r5 - asset_r5:+.3f} above raw asset R@5.", + ) + ) + + if _document_filter_gap(category, content_type_filters): + issues.append( + _issue( + "document_family_filter_gap", + "medium", + "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.", + ) + ) + + if _has_generic_media_queries(first_rows): + issues.append( + _issue( + "generic_query_artifact", + "medium", + "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + ) + ) + + missing_paths = _missing_media_query_paths(first_rows) + if missing_paths: + issues.append( + _issue( + "media_query_path_missing", + "low", + f"{len(missing_paths)} per-query rows omit image_query_path or video_query_path instrumentation.", + ) + ) + + weakness = max(0.0, weak_recall_at_5 - (best_r5 or 0.0)) + sample_penalty = 0.15 if total_queries < min_queries else 0.0 + media_penalty = 0.05 if media_query else 0.0 + document_penalty = 0.05 if category in DOCUMENT_CATEGORIES else 0.0 + priority_score = 
round(weakness + sample_penalty + media_penalty + document_penalty, 4) + + rrf_rows = _stage_queries(stages, rrf_stage, category) + hybrid_rows = _stage_queries(stages, hybrid_stage, category) + audit_sources = _audit_source_counts(hybrid_rows or rrf_rows) + + return { + "category": category, + "target_content_type": _target_content_type(category), + "configured_content_type_filter": content_type_filters.get(category), + "total_queries": total_queries, + "best_stage": best_name, + "best_recall_at_5": best_r5, + "priority_score": priority_score, + "metrics": { + "vector_recall_at_5": vector_r5, + "vector_recall_at_10": vector_r10, + "bm25_recall_at_5": bm25_r5, + "rrf_recall_at_5": rrf_r5, + "rrf_recall_at_10": rrf_r10, + "hybrid_recall_at_5": hybrid_r5, + "hybrid_recall_at_10": hybrid_r10, + }, + "issues": issues, + "audit_source_counts_top5": audit_sources, + "failure_examples": _failure_examples(best_rows or first_rows), + } + + +def _aggregate_actions(diagnoses: Sequence[Mapping[str, Any]]) -> List[Dict[str, Any]]: + issue_codes_by_category: Dict[str, List[str]] = {} + for diagnosis in diagnoses: + issue_codes_by_category[str(diagnosis["category"])] = [ + str(issue["code"]) for issue in diagnosis.get("issues", []) + ] + + actions: List[Dict[str, Any]] = [] + + weak_media = [ + category + for category, codes in issue_codes_by_category.items() + if "embedding_alignment_gap" in codes or "derived_text_probe_insufficient" in codes + ] + if weak_media: + actions.append( + _action( + "search", + 1, + "Add bounded cascaded media reranking only after cheap top-K retrieval.", + "The weakest media-query categories are not rescued by current RRF/reranker stages, so REC-130 should target a strict top-K cascade instead of broad expensive scoring.", + ) + ) + + if any("under_sampled_category" in codes for codes in issue_codes_by_category.values()): + actions.append( + _action( + "evals", + 2, + "Expand weak categories to at least 20 queries and keep parent-memory scoring.", + 
"Several weak categories have 1-3 examples, which is too small to distinguish model weakness from benchmark noise; this maps directly to REC-160.", + ) + ) + + if any("document_family_filter_gap" in codes for codes in issue_codes_by_category.values()): + actions.append( + _action( + "indexing", + 3, + "Represent document-family filters explicitly across pdf/docx/pptx roots and children.", + "Document retrieval categories are evaluated without a proper document-family content filter, so unrelated images/videos can dominate media-query results.", + ) + ) + + if any("generic_query_artifact" in codes for codes in issue_codes_by_category.values()): + actions.append( + _action( + "evals", + 4, + "Replace placeholder media prompts with grounded intent labels and provenance.", + "Queries such as 'related document' are useful smoke probes but too ambiguous for release-quality diagnostics.", + ) + ) + + if any("parent_asset_metrics_missing" in codes for codes in issue_codes_by_category.values()): + actions.append( + _action( + "evals", + 5, + "Rerun cross-modal ablation with the current harness to populate asset_level metrics.", + "The checked-in result is from v0.2.0 and predates serialized asset-level rollups, so it cannot fully separate child-asset hits from parent-memory hits.", + ) + ) + + if any("derived_text_probe_lift" in codes for codes in issue_codes_by_category.values()): + actions.append( + _action( + "ingest", + 6, + "Keep strengthening captions, transcripts, and OCR as first-class retrieval text.", + "Where RRF improves over vector-only, the improvement is evidence that derived text is helping and should be cached/versioned rather than recomputed ad hoc.", + ) + ) + + actions.append( + _action( + "model_research", + 7, + "Benchmark visual/document-specialized retrievers against the weak categories.", + "ViDoRe-style visual document retrieval and MTEB/BEIR-style qrels offer better external baselines for document-heavy failures than anecdotes from one synthetic 
corpus.", + ) + ) + + return actions + + +def _issue_summary(diagnoses: Sequence[Mapping[str, Any]]) -> Dict[str, int]: + counts: Counter[str] = Counter() + for diagnosis in diagnoses: + for issue in diagnosis.get("issues", []): + counts[str(issue["code"])] += 1 + return dict(sorted(counts.items())) + + +def _load_content_type_filters(categories: Sequence[str]) -> Dict[str, Optional[str]]: + """Inspect the current benchmark helper without making it a hard dependency.""" + try: + project_root = Path(__file__).resolve().parent.parent + if str(project_root) not in sys.path: + sys.path.insert(0, str(project_root)) + from benchmarks import cross_modal_ablation # type: ignore + except Exception: + return {} + return { + category: cross_modal_ablation._result_content_type_for_category(category) + for category in categories + } + + +def build_diagnostics( + payload: Mapping[str, Any], + *, + content_type_filters: Optional[Mapping[str, Optional[str]]] = None, + min_queries: int = MIN_CATEGORY_QUERIES, + weak_recall_at_5: float = WEAK_RECALL_AT_5, + meaningful_delta: float = MEANINGFUL_DELTA, +) -> Dict[str, Any]: + categories = _category_names(payload) + filters = dict(content_type_filters or _load_content_type_filters(categories)) + diagnoses = [ + _build_category_diagnosis( + payload, + category, + content_type_filters=filters, + min_queries=min_queries, + weak_recall_at_5=weak_recall_at_5, + meaningful_delta=meaningful_delta, + ) + for category in categories + ] + diagnoses.sort( + key=lambda row: ( + -float(row["priority_score"]), + str(row["category"]), + ) + ) + + weak_categories = [ + diagnosis + for diagnosis in diagnoses + if diagnosis["best_recall_at_5"] is None + or float(diagnosis["best_recall_at_5"]) < weak_recall_at_5 + or float(diagnosis["priority_score"]) >= 0.20 + ] + + return { + "diagnostic": "cross_modal_retrieval", + "diagnosed_at": datetime.now(timezone.utc).isoformat(), + "source": { + "benchmark": payload.get("benchmark"), + "version": 
payload.get("version"), + "generated_at": payload.get("generated_at"), + "run_status": payload.get("run_status"), + }, + "thresholds": { + "weak_recall_at_5": weak_recall_at_5, + "min_category_queries": min_queries, + "meaningful_delta": meaningful_delta, + }, + "summary": { + "categories": len(diagnoses), + "weak_categories": len(weak_categories), + "issue_counts": _issue_summary(diagnoses), + }, + "weak_categories": weak_categories, + "all_categories": diagnoses, + "prioritized_actions": _aggregate_actions(diagnoses), + "method_notes": [ + "Vector-only is treated as the raw embedding baseline.", + "RRF lift over vector-only is treated as evidence from derived text probes such as captions, transcripts, OCR, or BM25 text.", + "Hybrid-minus-RRF isolates the current reranker contribution.", + "Parent-memory versus asset-level scoring is only available when the source payload includes asset_level metrics.", + ], + } + + +def _fmt_pct(value: Optional[float]) -> str: + if value is None: + return "n/a" + return f"{value:.1%}" + + +def _fmt_filter(value: Optional[str]) -> str: + return "`None`" if value is None else f"`{value}`" + + +def render_markdown(diagnostics: Mapping[str, Any]) -> str: + source = diagnostics.get("source", {}) or {} + thresholds = diagnostics.get("thresholds", {}) or {} + weak_categories = diagnostics.get("weak_categories", []) or [] + all_categories = diagnostics.get("all_categories", []) or [] + actions = diagnostics.get("prioritized_actions", []) or [] + issue_counts = (diagnostics.get("summary", {}) or {}).get("issue_counts", {}) or {} + + lines: List[str] = [ + "# Cross-Modal Retrieval Diagnostics", + "", + "This report is generated from the saved cross-modal ablation JSON. 
It separates raw embedding alignment, derived-text contribution, reranker contribution, benchmark artifacts, and parent-memory scoring coverage.", + "", + "## Source", + "", + f"- Benchmark: `{source.get('benchmark') or 'unknown'}`", + f"- Source version: `{source.get('version') or 'unknown'}`", + f"- Source generated at: `{source.get('generated_at') or 'unknown'}`", + f"- Run status: `{source.get('run_status') or 'unknown'}`", + f"- Weak threshold: R@5 < {_fmt_pct(_safe_float(thresholds.get('weak_recall_at_5')))}", + f"- Query floor: {thresholds.get('min_category_queries')} queries per category", + "", + "## Weak And At-Risk Category Ranking", + "", + "| Priority | Category | Queries | Best stage | Best R@5 | Vector R@5 | RRF R@5 | Hybrid R@5 | Key issues |", + "|---:|---|---:|---|---:|---:|---:|---:|---|", + ] + + for diagnosis in weak_categories: + metrics = diagnosis.get("metrics", {}) or {} + issues = ", ".join(issue["code"] for issue in diagnosis.get("issues", [])[:4]) + lines.append( + "| " + f"{diagnosis.get('priority_score'):.2f} | " + f"`{diagnosis.get('category')}` | " + f"{diagnosis.get('total_queries')} | " + f"{diagnosis.get('best_stage') or 'n/a'} | " + f"{_fmt_pct(_safe_float(diagnosis.get('best_recall_at_5')))} | " + f"{_fmt_pct(_safe_float(metrics.get('vector_recall_at_5')))} | " + f"{_fmt_pct(_safe_float(metrics.get('rrf_recall_at_5')))} | " + f"{_fmt_pct(_safe_float(metrics.get('hybrid_recall_at_5')))} | " + f"{issues or 'none'} |" + ) + + lines.extend( + [ + "", + "## Diagnosis Summary", + "", + ] + ) + for code, count in issue_counts.items(): + lines.append(f"- `{code}`: {count}") + + lines.extend( + [ + "", + "## Prioritized Fix List", + "", + ] + ) + for action in actions: + lines.append( + f"{action['priority']}. 
**{action['owner']}** - {action['title']} {action['rationale']}" + ) + + lines.extend( + [ + "", + "## Category Evidence", + "", + ] + ) + for diagnosis in all_categories: + metrics = diagnosis.get("metrics", {}) or {} + lines.extend( + [ + f"### `{diagnosis.get('category')}`", + "", + f"- Queries: {diagnosis.get('total_queries')}", + f"- Target result family: `{diagnosis.get('target_content_type') or 'mixed'}`", + f"- Configured benchmark content filter: {_fmt_filter(diagnosis.get('configured_content_type_filter'))}", + f"- Best stage/R@5: {diagnosis.get('best_stage') or 'n/a'} / {_fmt_pct(_safe_float(diagnosis.get('best_recall_at_5')))}", + f"- Raw vector R@5/R@10: {_fmt_pct(_safe_float(metrics.get('vector_recall_at_5')))} / {_fmt_pct(_safe_float(metrics.get('vector_recall_at_10')))}", + f"- RRF R@5/R@10: {_fmt_pct(_safe_float(metrics.get('rrf_recall_at_5')))} / {_fmt_pct(_safe_float(metrics.get('rrf_recall_at_10')))}", + f"- Hybrid R@5/R@10: {_fmt_pct(_safe_float(metrics.get('hybrid_recall_at_5')))} / {_fmt_pct(_safe_float(metrics.get('hybrid_recall_at_10')))}", + ] + ) + audit_counts = diagnosis.get("audit_source_counts_top5") or {} + if audit_counts: + source_text = ", ".join(f"`{key}`={value}" for key, value in audit_counts.items()) + lines.append(f"- Top-5 audit source counts: {source_text}") + for issue in diagnosis.get("issues", []): + lines.append(f"- `{issue['code']}` ({issue['severity']}): {issue['evidence']}") + examples = diagnosis.get("failure_examples") or [] + if examples: + lines.append("- Example misses:") + for example in examples: + top_results = ", ".join( + f"`{path}`" for path in (example.get("top_results") or []) if path + ) + query = example.get("query") or "" + lines.append( + f" - `{query}` ({example.get('query_type')}): expected {example.get('relevant_paths')}; top results {top_results or 'n/a'}" + ) + lines.append("") + + lines.extend( + [ + "## Method Notes", + "", + ] + ) + for note in diagnostics.get("method_notes", []) or []: + 
lines.append(f"- {note}") + + lines.extend( + [ + "", + "## External Evaluation References", + "", + "- [BEIR](https://github.com/beir-cellar/beir) structures retrieval evaluation around corpus, queries, qrels, run results, and metrics such as NDCG, MAP, Recall, Precision, and MRR.", + "- [MTEB](https://github.com/embeddings-benchmark/mteb) is the broader embedding and retrieval evaluation framework now used by ViDoRe for single-model retriever submissions.", + "- [ViDoRe pipeline evaluation](https://github.com/illuin-tech/vidore-benchmark) explicitly covers multi-stage, hybrid, reranking, OCR, and custom preprocessing pipelines for visual document retrieval.", + "", + ] + ) + return "\n".join(lines) + + +def _write_json(path: Path, payload: Mapping[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + +def _write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + + +def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--input", type=Path, default=DEFAULT_INPUT, help="cross_modal_ablation JSON to diagnose") + parser.add_argument("--output-json", type=Path, default=DEFAULT_JSON_OUTPUT, help="diagnostic JSON output path") + parser.add_argument("--output-md", type=Path, default=DEFAULT_MARKDOWN_OUTPUT, help="diagnostic Markdown output path") + parser.add_argument("--min-queries", type=int, default=MIN_CATEGORY_QUERIES, help="minimum desired queries per category") + parser.add_argument("--weak-recall-at-5", type=float, default=WEAK_RECALL_AT_5, help="R@5 threshold for weak categories") + return parser.parse_args(argv) + + +def main(argv: Optional[Sequence[str]] = None) -> int: + args = parse_args(argv) + payload = json.loads(args.input.read_text(encoding="utf-8")) + diagnostics = 
build_diagnostics( + payload, + min_queries=args.min_queries, + weak_recall_at_5=args.weak_recall_at_5, + ) + _write_json(args.output_json, diagnostics) + _write_text(args.output_md, render_markdown(diagnostics)) + print(f"Wrote {args.output_json}") + print(f"Wrote {args.output_md}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/results/cross_modal_diagnostics.json b/benchmarks/results/cross_modal_diagnostics.json new file mode 100644 index 0000000..76a4473 --- /dev/null +++ b/benchmarks/results/cross_modal_diagnostics.json @@ -0,0 +1,1579 @@ +{ + "all_categories": [ + { + "audit_source_counts_top5": { + "original_fts": 8, + "original_vec": 8 + }, + "best_recall_at_5": 0.0, + "best_stage": "Vector-only", + "category": "image_to_document", + "configured_content_type_filter": null, + "failure_examples": [ + { + "image_query_path": "images/neural_network_diagram.png", + "query": "related document", + "query_type": "image", + "relevant_paths": [ + "documents/ai_strategy_report.docx", + "documents/ai_architecture_deck.pptx", + "documents/embedding_research.pdf" + ], + "top_results": [ + "images/neural_network_diagram.png", + "images/whiteboard_brainstorm.png", + "images/whiteboard_architecture.png" + ], + "video_query_path": null + }, + { + "image_query_path": "images/code_editor_screenshot.png", + "query": "related document", + "query_type": "image", + "relevant_paths": [ + "documents/recallforge_spec.docx", + "documents/operations_manual.pdf" + ], + "top_results": [ + "images/code_editor_screenshot.png", + "videos/coding_demo.mp4::transcript:0001@0.00s", + "images/whiteboard_architecture.png" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "2 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process image queries", + "severity": "medium" + }, + { + "code": 
"embedding_alignment_gap", + "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_lift", + "evidence": "RRF improves R@5 by +0.000 and R@10 by +0.500 over raw vector search.", + "severity": "positive" + }, + { + "code": "reranker_no_lift", + "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "document_family_filter_gap", + "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 0.5, + "hybrid_recall_at_5": 0.0, + "rrf_recall_at_10": 0.5, + "rrf_recall_at_5": 0.0, + "vector_recall_at_10": 0.0, + "vector_recall_at_5": 0.0 + }, + "priority_score": 0.85, + "target_content_type": "document_family", + "total_queries": 2 + }, + { + "audit_source_counts_top5": { + "original_fts": 4, + "original_vec": 4 + }, + "best_recall_at_5": 0.0, + "best_stage": "Vector-only", + "category": "video_to_document", + "configured_content_type_filter": null, + "failure_examples": [ + { + "image_query_path": null, + "query": "related document", + "query_type": "video", + "relevant_paths": [ + "documents/ai_architecture_deck.pptx" + ], + "top_results": [ + "videos/architecture_walkthrough.mp4", + "videos/architecture_walkthrough.mp4::frame:0004@15.00s", + 
"videos/architecture_walkthrough.mp4::frame:0006@25.00s" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "1 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process video queries", + "severity": "medium" + }, + { + "code": "embedding_alignment_gap", + "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_insufficient", + "evidence": "RRF R@5=0.000 does not materially lift vector R@5=0.000.", + "severity": "high" + }, + { + "code": "reranker_no_lift", + "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "document_family_filter_gap", + "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + }, + { + "code": "media_query_path_missing", + "evidence": "1 per-query rows omit image_query_path or video_query_path instrumentation.", + "severity": "low" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 0.0, + "hybrid_recall_at_5": 0.0, + "rrf_recall_at_10": 0.0, + "rrf_recall_at_5": 0.0, + "vector_recall_at_10": 0.0, + "vector_recall_at_5": 0.0 + }, + "priority_score": 0.85, + "target_content_type": "document_family", + "total_queries": 1 + }, + { + "audit_source_counts_top5": { + 
"original_fts": 7, + "original_vec": 8 + }, + "best_recall_at_5": 0.0, + "best_stage": "Vector-only", + "category": "video_to_image", + "configured_content_type_filter": "image", + "failure_examples": [ + { + "image_query_path": null, + "query": "related image", + "query_type": "video", + "relevant_paths": [ + "images/forest_landscape.png", + "images/mountain_landscape.png", + "images/ocean_beach.png" + ], + "top_results": [ + "videos/nature_timelapse.mp4::frame:0006@25.00s", + "videos/nature_timelapse.mp4::frame:0005@20.00s", + "videos/nature_timelapse.mp4::frame:0007@30.00s" + ], + "video_query_path": null + }, + { + "image_query_path": null, + "query": "related image", + "query_type": "video", + "relevant_paths": [ + "images/whiteboard_brainstorm.png", + "images/whiteboard_architecture.png" + ], + "top_results": [ + "videos/whiteboard_session.mp4::frame:0003@10.00s", + "videos/whiteboard_session.mp4::frame:0002@5.00s", + "videos/whiteboard_session.mp4::frame:0001@0.00s" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "2 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process video queries", + "severity": "medium" + }, + { + "code": "embedding_alignment_gap", + "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_insufficient", + "evidence": "RRF R@5=0.000 does not materially lift vector R@5=0.000.", + "severity": "high" + }, + { + "code": "reranker_no_lift", + "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": 
"generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + }, + { + "code": "media_query_path_missing", + "evidence": "2 per-query rows omit image_query_path or video_query_path instrumentation.", + "severity": "low" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 0.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 0.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 0.0 + }, + "priority_score": 0.8, + "target_content_type": "image", + "total_queries": 2 + }, + { + "audit_source_counts_top5": { + "original_fts": 8, + "original_vec": 12 + }, + "best_recall_at_5": 0.3333, + "best_stage": "Vector-only", + "category": "video_to_text", + "configured_content_type_filter": "text", + "failure_examples": [ + { + "image_query_path": null, + "query": "related text", + "query_type": "video", + "relevant_paths": [ + "text/tech_cybersecurity.md", + "text/tech_cloud_computing.md" + ], + "top_results": [ + "videos/coding_demo.mp4::transcript:0002@2.00s", + "videos/whiteboard_session.mp4::transcript:0001@0.00s", + "text/medicine_nutrition.md" + ], + "video_query_path": null + }, + { + "image_query_path": null, + "query": "related text", + "query_type": "video", + "relevant_paths": [ + "text/architecture_gothic.md", + "text/architecture_modern.md", + "text/architecture_blueprints.md" + ], + "top_results": [ + "videos/coding_demo.mp4::transcript:0002@2.00s", + "text/ai_agents.md", + "documents/ai_architecture_deck.pptx::slide:0001" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "3 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process video queries", + "severity": "medium" + }, + { + "code": "embedding_alignment_gap", + 
"evidence": "Vector-only R@5=0.333; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_insufficient", + "evidence": "RRF R@5=0.333 does not materially lift vector R@5=0.333.", + "severity": "high" + }, + { + "code": "reranker_no_lift", + "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + }, + { + "code": "media_query_path_missing", + "evidence": "3 per-query rows omit image_query_path or video_query_path instrumentation.", + "severity": "low" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 0.3333, + "hybrid_recall_at_5": 0.3333, + "rrf_recall_at_10": 0.3333, + "rrf_recall_at_5": 0.3333, + "vector_recall_at_10": 0.3333, + "vector_recall_at_5": 0.3333 + }, + "priority_score": 0.4667, + "target_content_type": "text", + "total_queries": 3 + }, + { + "audit_source_counts_top5": { + "original_fts": 8, + "original_vec": 12 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "image_to_image", + "configured_content_type_filter": "image", + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "3 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process image queries", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current 
harness to separate parent-memory and child-asset hits.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.2, + "target_content_type": "image", + "total_queries": 3 + }, + { + "audit_source_counts_top5": { + "original_fts": 60, + "original_vec": 63 + }, + "best_recall_at_5": 0.6, + "best_stage": "Vector + BM25 (RRF)", + "category": "image_to_text", + "configured_content_type_filter": "text", + "failure_examples": [ + { + "image_query_path": "images/ocean_beach.png", + "query": "", + "query_type": "image", + "relevant_paths": [ + "text/nature_oceans.md" + ], + "top_results": [ + "text/sports_golf.md", + "text/music_production.md", + "text/nature_forests.md" + ], + "video_query_path": null + }, + { + "image_query_path": "images/whiteboard_architecture.png", + "query": "", + "query_type": "image", + "relevant_paths": [ + "text/ai_agents.md", + "text/tech_edge_ai.md" + ], + "top_results": [ + "videos/architecture_walkthrough.mp4::transcript:0001@0.00s", + "text/tech_cloud_computing.md", + "videos/whiteboard_session.mp4::transcript:0001@0.00s" + ], + "video_query_path": null + }, + { + "image_query_path": "images/whiteboard_brainstorm.png", + "query": "", + "query_type": "image", + "relevant_paths": [ + "text/ai_agents.md", + "text/tech_cloud_computing.md" + ], + "top_results": [ + "videos/whiteboard_session.mp4::transcript:0002@2.00s", + "videos/whiteboard_session.mp4::transcript:0001@0.00s", + "videos/architecture_walkthrough.mp4::transcript:0001@0.00s" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "15 queries is below the 20-query diagnostic floor.", + "severity": "medium" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process image queries", + "severity": "medium" 
+ }, + { + "code": "embedding_alignment_gap", + "evidence": "Vector-only R@5=0.467; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_lift", + "evidence": "RRF improves R@5 by +0.133 and R@10 by +0.133 over raw vector search.", + "severity": "positive" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 0.8667, + "hybrid_recall_at_5": 0.6, + "rrf_recall_at_10": 0.8667, + "rrf_recall_at_5": 0.6, + "vector_recall_at_10": 0.7333, + "vector_recall_at_5": 0.4667 + }, + "priority_score": 0.2, + "target_content_type": "text", + "total_queries": 15 + }, + { + "audit_source_counts_top5": { + "original_fts": 2, + "original_vec": 8 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "image_to_video", + "configured_content_type_filter": "video", + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "2 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process image queries", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval 
quality with query-definition ambiguity.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.2, + "target_content_type": "video", + "total_queries": 2 + }, + { + "audit_source_counts_top5": { + "original_fts": 44, + "original_vec": 48 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "text_to_document", + "configured_content_type_filter": null, + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "12 queries is below the 20-query diagnostic floor.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "document_family_filter_gap", + "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": 1.0, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.2, + "target_content_type": "document_family", + "total_queries": 12 + }, + { + "audit_source_counts_top5": { + "original_vec": 4 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "video_to_video", + "configured_content_type_filter": "video", + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "1 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process video queries", + "severity": "medium" 
+ }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "media_query_path_missing", + "evidence": "1 per-query rows omit image_query_path or video_query_path instrumentation.", + "severity": "low" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.2, + "target_content_type": "video", + "total_queries": 1 + }, + { + "audit_source_counts_top5": { + "original_fts": 17, + "original_vec": 72 + }, + "best_recall_at_5": 0.9444, + "best_stage": "Vector + BM25 (RRF)", + "category": "text_to_image", + "configured_content_type_filter": "image", + "failure_examples": [ + { + "image_query_path": null, + "query": "coastal landscape photography", + "query_type": "text", + "relevant_paths": [ + "images/ocean_beach.png" + ], + "top_results": [ + "videos/nature_timelapse.mp4::frame:0006@25.00s", + "videos/nature_timelapse.mp4::frame:0005@20.00s", + "videos/nature_timelapse.mp4::frame:0003@10.00s" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "18 queries is below the 20-query diagnostic floor.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": 0.7222, + "hybrid_recall_at_10": 0.9444, + "hybrid_recall_at_5": 0.9444, + "rrf_recall_at_10": 0.9444, + "rrf_recall_at_5": 0.9444, + "vector_recall_at_10": 0.9444, + "vector_recall_at_5": 0.8889 + }, + "priority_score": 0.15, + "target_content_type": "image", + 
"total_queries": 18 + }, + { + "audit_source_counts_top5": { + "original_fts": 1, + "original_vec": 60 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "text_to_video", + "configured_content_type_filter": "video", + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "15 queries is below the 20-query diagnostic floor.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": 0.1333, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.15, + "target_content_type": "video", + "total_queries": 15 + }, + { + "audit_source_counts_top5": { + "original_fts": 75, + "original_vec": 78 + }, + "best_recall_at_5": 0.95, + "best_stage": "BM25-only", + "category": "mixed_modal", + "configured_content_type_filter": null, + "failure_examples": [ + { + "image_query_path": null, + "query": "comprehensive guide to athletic performance", + "query_type": "text", + "relevant_paths": [ + "text/sports_running.md", + "text/sports_cycling.md", + "text/sports_swimming.md", + "text/sports_yoga.md", + "text/medicine_nutrition.md", + "text/medicine_cardiology.md" + ], + "top_results": [ + "text/cooking_sourdough.md", + "documents/ai_architecture_deck.pptx::slide:0003", + "documents/edge_deployment_guide.pdf::page:0001" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": 0.95, + 
"hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 0.9, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 0.95, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 0.85 + }, + "priority_score": 0.0, + "target_content_type": null, + "total_queries": 20 + }, + { + "audit_source_counts_top5": { + "original_fts": 188, + "original_vec": 241 + }, + "best_recall_at_5": 0.9, + "best_stage": "Vector + BM25 (RRF)", + "category": "text_to_text", + "configured_content_type_filter": "text", + "failure_examples": [ + { + "image_query_path": null, + "query": "how do computers understand the meaning of words", + "query_type": "text", + "relevant_paths": [ + "text/ai_embeddings.md" + ], + "top_results": [ + "text/ai_transformers.md", + "text/tech_quantum_computing.md", + "text/tech_cloud_computing.md" + ], + "video_query_path": null + }, + { + "image_query_path": null, + "query": "underwater basket weaving techniques", + "query_type": "text", + "relevant_paths": [], + "top_results": [ + "text/sports_swimming.md", + "text/architecture_gothic.md", + "text/cooking_sourdough.md" + ], + "video_query_path": null + }, + { + "image_query_path": null, + "query": "medieval jousting tournament rules and equipment", + "query_type": "text", + "relevant_paths": [], + "top_results": [ + "text/sports_swimming.md", + "text/sports_cycling.md", + "text/history_renaissance.md" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": 0.8667, + "hybrid_recall_at_10": 0.9167, + "hybrid_recall_at_5": 0.9, + "rrf_recall_at_10": 0.9167, + "rrf_recall_at_5": 0.9, + "vector_recall_at_10": 0.9, + "vector_recall_at_5": 0.8833 + }, + "priority_score": 0.0, + "target_content_type": "text", + "total_queries": 60 + } + ], + "diagnosed_at": 
"2026-05-17T17:57:55.151267+00:00", + "diagnostic": "cross_modal_retrieval", + "method_notes": [ + "Vector-only is treated as the raw embedding baseline.", + "RRF lift over vector-only is treated as evidence from derived text probes such as captions, transcripts, OCR, or BM25 text.", + "Hybrid-minus-RRF isolates the current reranker contribution.", + "Parent-memory versus asset-level scoring is only available when the source payload includes asset_level metrics." + ], + "prioritized_actions": [ + { + "owner": "search", + "priority": 1, + "rationale": "The weakest media-query categories are not rescued by current RRF/reranker stages, so REC-130 should target a strict top-K cascade instead of broad expensive scoring.", + "title": "Add bounded cascaded media reranking only after cheap top-K retrieval." + }, + { + "owner": "evals", + "priority": 2, + "rationale": "Several weak categories have 1-3 examples, which is too small to distinguish model weakness from benchmark noise; this maps directly to REC-160.", + "title": "Expand weak categories to at least 20 queries and keep parent-memory scoring." + }, + { + "owner": "indexing", + "priority": 3, + "rationale": "Document retrieval categories are evaluated without a proper document-family content filter, so unrelated images/videos can dominate media-query results.", + "title": "Represent document-family filters explicitly across pdf/docx/pptx roots and children." + }, + { + "owner": "evals", + "priority": 4, + "rationale": "Queries such as 'related document' are useful smoke probes but too ambiguous for release-quality diagnostics.", + "title": "Replace placeholder media prompts with grounded intent labels and provenance." 
+ }, + { + "owner": "evals", + "priority": 5, + "rationale": "The checked-in result is from v0.2.0 and predates serialized asset-level rollups, so it cannot fully separate child-asset hits from parent-memory hits.", + "title": "Rerun cross-modal ablation with the current harness to populate asset_level metrics." + }, + { + "owner": "ingest", + "priority": 6, + "rationale": "Where RRF improves over vector-only, the improvement is evidence that derived text is helping and should be cached/versioned rather than recomputed ad hoc.", + "title": "Keep strengthening captions, transcripts, and OCR as first-class retrieval text." + }, + { + "owner": "model_research", + "priority": 7, + "rationale": "ViDoRe-style visual document retrieval and MTEB/BEIR-style qrels offer better external baselines for document-heavy failures than anecdotes from one synthetic corpus.", + "title": "Benchmark visual/document-specialized retrievers against the weak categories." + } + ], + "source": { + "benchmark": "cross_modal_ablation", + "generated_at": "2026-03-22T01:15:51.127774+00:00", + "run_status": "complete", + "version": "0.2.0" + }, + "summary": { + "categories": 13, + "issue_counts": { + "bm25_modality_blind": 8, + "derived_text_probe_insufficient": 3, + "derived_text_probe_lift": 2, + "document_family_filter_gap": 3, + "embedding_alignment_gap": 5, + "generic_query_artifact": 6, + "media_query_path_missing": 4, + "parent_asset_metrics_missing": 13, + "reranker_no_lift": 4, + "under_sampled_category": 11 + }, + "weak_categories": 9 + }, + "thresholds": { + "meaningful_delta": 0.1, + "min_category_queries": 20, + "weak_recall_at_5": 0.6 + }, + "weak_categories": [ + { + "audit_source_counts_top5": { + "original_fts": 8, + "original_vec": 8 + }, + "best_recall_at_5": 0.0, + "best_stage": "Vector-only", + "category": "image_to_document", + "configured_content_type_filter": null, + "failure_examples": [ + { + "image_query_path": "images/neural_network_diagram.png", + "query": "related 
document", + "query_type": "image", + "relevant_paths": [ + "documents/ai_strategy_report.docx", + "documents/ai_architecture_deck.pptx", + "documents/embedding_research.pdf" + ], + "top_results": [ + "images/neural_network_diagram.png", + "images/whiteboard_brainstorm.png", + "images/whiteboard_architecture.png" + ], + "video_query_path": null + }, + { + "image_query_path": "images/code_editor_screenshot.png", + "query": "related document", + "query_type": "image", + "relevant_paths": [ + "documents/recallforge_spec.docx", + "documents/operations_manual.pdf" + ], + "top_results": [ + "images/code_editor_screenshot.png", + "videos/coding_demo.mp4::transcript:0001@0.00s", + "images/whiteboard_architecture.png" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "2 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process image queries", + "severity": "medium" + }, + { + "code": "embedding_alignment_gap", + "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_lift", + "evidence": "RRF improves R@5 by +0.000 and R@10 by +0.500 over raw vector search.", + "severity": "positive" + }, + { + "code": "reranker_no_lift", + "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "document_family_filter_gap", + "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are 
generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 0.5, + "hybrid_recall_at_5": 0.0, + "rrf_recall_at_10": 0.5, + "rrf_recall_at_5": 0.0, + "vector_recall_at_10": 0.0, + "vector_recall_at_5": 0.0 + }, + "priority_score": 0.85, + "target_content_type": "document_family", + "total_queries": 2 + }, + { + "audit_source_counts_top5": { + "original_fts": 4, + "original_vec": 4 + }, + "best_recall_at_5": 0.0, + "best_stage": "Vector-only", + "category": "video_to_document", + "configured_content_type_filter": null, + "failure_examples": [ + { + "image_query_path": null, + "query": "related document", + "query_type": "video", + "relevant_paths": [ + "documents/ai_architecture_deck.pptx" + ], + "top_results": [ + "videos/architecture_walkthrough.mp4", + "videos/architecture_walkthrough.mp4::frame:0004@15.00s", + "videos/architecture_walkthrough.mp4::frame:0006@25.00s" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "1 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process video queries", + "severity": "medium" + }, + { + "code": "embedding_alignment_gap", + "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_insufficient", + "evidence": "RRF R@5=0.000 does not materially lift vector R@5=0.000.", + "severity": "high" + }, + { + "code": "reranker_no_lift", + "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + 
"severity": "medium" + }, + { + "code": "document_family_filter_gap", + "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + }, + { + "code": "media_query_path_missing", + "evidence": "1 per-query rows omit image_query_path or video_query_path instrumentation.", + "severity": "low" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 0.0, + "hybrid_recall_at_5": 0.0, + "rrf_recall_at_10": 0.0, + "rrf_recall_at_5": 0.0, + "vector_recall_at_10": 0.0, + "vector_recall_at_5": 0.0 + }, + "priority_score": 0.85, + "target_content_type": "document_family", + "total_queries": 1 + }, + { + "audit_source_counts_top5": { + "original_fts": 7, + "original_vec": 8 + }, + "best_recall_at_5": 0.0, + "best_stage": "Vector-only", + "category": "video_to_image", + "configured_content_type_filter": "image", + "failure_examples": [ + { + "image_query_path": null, + "query": "related image", + "query_type": "video", + "relevant_paths": [ + "images/forest_landscape.png", + "images/mountain_landscape.png", + "images/ocean_beach.png" + ], + "top_results": [ + "videos/nature_timelapse.mp4::frame:0006@25.00s", + "videos/nature_timelapse.mp4::frame:0005@20.00s", + "videos/nature_timelapse.mp4::frame:0007@30.00s" + ], + "video_query_path": null + }, + { + "image_query_path": null, + "query": "related image", + "query_type": "video", + "relevant_paths": [ + "images/whiteboard_brainstorm.png", + "images/whiteboard_architecture.png" + ], + "top_results": [ + "videos/whiteboard_session.mp4::frame:0003@10.00s", + "videos/whiteboard_session.mp4::frame:0002@5.00s", + "videos/whiteboard_session.mp4::frame:0001@0.00s" + ], + "video_query_path": null + } 
+ ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "2 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process video queries", + "severity": "medium" + }, + { + "code": "embedding_alignment_gap", + "evidence": "Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_insufficient", + "evidence": "RRF R@5=0.000 does not materially lift vector R@5=0.000.", + "severity": "high" + }, + { + "code": "reranker_no_lift", + "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + }, + { + "code": "media_query_path_missing", + "evidence": "2 per-query rows omit image_query_path or video_query_path instrumentation.", + "severity": "low" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 0.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 0.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 0.0 + }, + "priority_score": 0.8, + "target_content_type": "image", + "total_queries": 2 + }, + { + "audit_source_counts_top5": { + "original_fts": 8, + "original_vec": 12 + }, + "best_recall_at_5": 0.3333, + "best_stage": "Vector-only", + "category": "video_to_text", + "configured_content_type_filter": "text", + "failure_examples": [ + { + "image_query_path": null, + "query": "related text", + "query_type": "video", + "relevant_paths": [ + 
"text/tech_cybersecurity.md", + "text/tech_cloud_computing.md" + ], + "top_results": [ + "videos/coding_demo.mp4::transcript:0002@2.00s", + "videos/whiteboard_session.mp4::transcript:0001@0.00s", + "text/medicine_nutrition.md" + ], + "video_query_path": null + }, + { + "image_query_path": null, + "query": "related text", + "query_type": "video", + "relevant_paths": [ + "text/architecture_gothic.md", + "text/architecture_modern.md", + "text/architecture_blueprints.md" + ], + "top_results": [ + "videos/coding_demo.mp4::transcript:0002@2.00s", + "text/ai_agents.md", + "documents/ai_architecture_deck.pptx::slide:0001" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "3 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process video queries", + "severity": "medium" + }, + { + "code": "embedding_alignment_gap", + "evidence": "Vector-only R@5=0.333; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_insufficient", + "evidence": "RRF R@5=0.333 does not materially lift vector R@5=0.333.", + "severity": "high" + }, + { + "code": "reranker_no_lift", + "evidence": "Hybrid reranker changes R@5 by only +0.000 versus RRF.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + }, + { + "code": "media_query_path_missing", + "evidence": "3 per-query rows omit image_query_path or video_query_path instrumentation.", + "severity": "low" + } + ], + 
"metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 0.3333, + "hybrid_recall_at_5": 0.3333, + "rrf_recall_at_10": 0.3333, + "rrf_recall_at_5": 0.3333, + "vector_recall_at_10": 0.3333, + "vector_recall_at_5": 0.3333 + }, + "priority_score": 0.4667, + "target_content_type": "text", + "total_queries": 3 + }, + { + "audit_source_counts_top5": { + "original_fts": 8, + "original_vec": 12 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "image_to_image", + "configured_content_type_filter": "image", + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "3 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process image queries", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.2, + "target_content_type": "image", + "total_queries": 3 + }, + { + "audit_source_counts_top5": { + "original_fts": 60, + "original_vec": 63 + }, + "best_recall_at_5": 0.6, + "best_stage": "Vector + BM25 (RRF)", + "category": "image_to_text", + "configured_content_type_filter": "text", + "failure_examples": [ + { + "image_query_path": "images/ocean_beach.png", + "query": "", + "query_type": "image", + "relevant_paths": [ + "text/nature_oceans.md" + ], + "top_results": [ + "text/sports_golf.md", + "text/music_production.md", + "text/nature_forests.md" + ], + "video_query_path": null + }, + { + "image_query_path": "images/whiteboard_architecture.png", + "query": "", + "query_type": "image", + 
"relevant_paths": [ + "text/ai_agents.md", + "text/tech_edge_ai.md" + ], + "top_results": [ + "videos/architecture_walkthrough.mp4::transcript:0001@0.00s", + "text/tech_cloud_computing.md", + "videos/whiteboard_session.mp4::transcript:0001@0.00s" + ], + "video_query_path": null + }, + { + "image_query_path": "images/whiteboard_brainstorm.png", + "query": "", + "query_type": "image", + "relevant_paths": [ + "text/ai_agents.md", + "text/tech_cloud_computing.md" + ], + "top_results": [ + "videos/whiteboard_session.mp4::transcript:0002@2.00s", + "videos/whiteboard_session.mp4::transcript:0001@0.00s", + "videos/architecture_walkthrough.mp4::transcript:0001@0.00s" + ], + "video_query_path": null + } + ], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "15 queries is below the 20-query diagnostic floor.", + "severity": "medium" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process image queries", + "severity": "medium" + }, + { + "code": "embedding_alignment_gap", + "evidence": "Vector-only R@5=0.467; media-query categories need raw embedding alignment above 0.60.", + "severity": "high" + }, + { + "code": "derived_text_probe_lift", + "evidence": "RRF improves R@5 by +0.133 and R@10 by +0.133 over raw vector search.", + "severity": "positive" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 0.8667, + "hybrid_recall_at_5": 0.6, + "rrf_recall_at_10": 0.8667, + "rrf_recall_at_5": 0.6, + "vector_recall_at_10": 0.7333, + "vector_recall_at_5": 0.4667 + }, + 
"priority_score": 0.2, + "target_content_type": "text", + "total_queries": 15 + }, + { + "audit_source_counts_top5": { + "original_fts": 2, + "original_vec": 8 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "image_to_video", + "configured_content_type_filter": "video", + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "2 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process image queries", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "generic_query_artifact", + "evidence": "Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.2, + "target_content_type": "video", + "total_queries": 2 + }, + { + "audit_source_counts_top5": { + "original_fts": 44, + "original_vec": 48 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "text_to_document", + "configured_content_type_filter": null, + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "12 queries is below the 20-query diagnostic floor.", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": 
"document_family_filter_gap", + "evidence": "Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter.", + "severity": "medium" + } + ], + "metrics": { + "bm25_recall_at_5": 1.0, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.2, + "target_content_type": "document_family", + "total_queries": 12 + }, + { + "audit_source_counts_top5": { + "original_vec": 4 + }, + "best_recall_at_5": 1.0, + "best_stage": "Vector-only", + "category": "video_to_video", + "configured_content_type_filter": "video", + "failure_examples": [], + "issues": [ + { + "code": "under_sampled_category", + "evidence": "1 queries is below the 20-query diagnostic floor.", + "severity": "high" + }, + { + "code": "bm25_modality_blind", + "evidence": "BM25 can't process video queries", + "severity": "medium" + }, + { + "code": "parent_asset_metrics_missing", + "evidence": "Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits.", + "severity": "medium" + }, + { + "code": "media_query_path_missing", + "evidence": "1 per-query rows omit image_query_path or video_query_path instrumentation.", + "severity": "low" + } + ], + "metrics": { + "bm25_recall_at_5": null, + "hybrid_recall_at_10": 1.0, + "hybrid_recall_at_5": 1.0, + "rrf_recall_at_10": 1.0, + "rrf_recall_at_5": 1.0, + "vector_recall_at_10": 1.0, + "vector_recall_at_5": 1.0 + }, + "priority_score": 0.2, + "target_content_type": "video", + "total_queries": 1 + } + ] +} diff --git a/docs/RELEASE.md b/docs/RELEASE.md index 8854dc6..9c9a87d 100644 --- a/docs/RELEASE.md +++ b/docs/RELEASE.md @@ -45,6 +45,14 @@ Then run the expanded benchmark: The benchmark now checkpoints to JSON as it runs. 
If the run is interrupted, the output file still contains partial results plus progress metadata. +After a complete or partial benchmark run, generate the cross-modal diagnosis report: + +```bash +.venv/bin/python benchmarks/cross_modal_diagnostics.py --input benchmarks/results/cross_modal_ablation_results.json +``` + +This writes `benchmarks/results/cross_modal_diagnostics.json` and `docs/research/cross-modal-diagnostics.md`, ranking weak or under-sampled categories and separating raw embedding, derived-text, reranker, parent-memory, and benchmark-artifact signals. + For safer local validation after the MLX hardening work, prefer the bounded smoke lane first: ```bash diff --git a/docs/research/cross-modal-diagnostics.md b/docs/research/cross-modal-diagnostics.md new file mode 100644 index 0000000..c5149a2 --- /dev/null +++ b/docs/research/cross-modal-diagnostics.md @@ -0,0 +1,289 @@ +# Cross-Modal Retrieval Diagnostics + +This report is generated from the saved cross-modal ablation JSON. It separates raw embedding alignment, derived-text contribution, reranker contribution, benchmark artifacts, and parent-memory scoring coverage. 
+ +## Source + +- Benchmark: `cross_modal_ablation` +- Source version: `0.2.0` +- Source generated at: `2026-03-22T01:15:51.127774+00:00` +- Run status: `complete` +- Weak threshold: R@5 < 60.0% +- Query floor: 20 queries per category + +## Weak And At-Risk Category Ranking + +| Priority | Category | Queries | Best stage | Best R@5 | Vector R@5 | RRF R@5 | Hybrid R@5 | Key issues | +|---:|---|---:|---|---:|---:|---:|---:|---| +| 0.85 | `image_to_document` | 2 | Vector-only | 0.0% | 0.0% | 0.0% | 0.0% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_lift | +| 0.85 | `video_to_document` | 1 | Vector-only | 0.0% | 0.0% | 0.0% | 0.0% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_insufficient | +| 0.80 | `video_to_image` | 2 | Vector-only | 0.0% | 0.0% | 0.0% | 0.0% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_insufficient | +| 0.47 | `video_to_text` | 3 | Vector-only | 33.3% | 33.3% | 33.3% | 33.3% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_insufficient | +| 0.20 | `image_to_image` | 3 | Vector-only | 100.0% | 100.0% | 100.0% | 100.0% | under_sampled_category, bm25_modality_blind, parent_asset_metrics_missing | +| 0.20 | `image_to_text` | 15 | Vector + BM25 (RRF) | 60.0% | 46.7% | 60.0% | 60.0% | under_sampled_category, bm25_modality_blind, embedding_alignment_gap, derived_text_probe_lift | +| 0.20 | `image_to_video` | 2 | Vector-only | 100.0% | 100.0% | 100.0% | 100.0% | under_sampled_category, bm25_modality_blind, parent_asset_metrics_missing, generic_query_artifact | +| 0.20 | `text_to_document` | 12 | Vector-only | 100.0% | 100.0% | 100.0% | 100.0% | under_sampled_category, parent_asset_metrics_missing, document_family_filter_gap | +| 0.20 | `video_to_video` | 1 | Vector-only | 100.0% | 100.0% | 100.0% | 100.0% | under_sampled_category, bm25_modality_blind, parent_asset_metrics_missing, 
media_query_path_missing | + +## Diagnosis Summary + +- `bm25_modality_blind`: 8 +- `derived_text_probe_insufficient`: 3 +- `derived_text_probe_lift`: 2 +- `document_family_filter_gap`: 3 +- `embedding_alignment_gap`: 5 +- `generic_query_artifact`: 6 +- `media_query_path_missing`: 4 +- `parent_asset_metrics_missing`: 13 +- `reranker_no_lift`: 4 +- `under_sampled_category`: 11 + +## Prioritized Fix List + +1. **search** - Add bounded cascaded media reranking only after cheap top-K retrieval. The weakest media-query categories are not rescued by current RRF/reranker stages, so REC-130 should target a strict top-K cascade instead of broad expensive scoring. +2. **evals** - Expand weak categories to at least 20 queries and keep parent-memory scoring. Several weak categories have 1-3 examples, which is too small to distinguish model weakness from benchmark noise; this maps directly to REC-160. +3. **indexing** - Represent document-family filters explicitly across pdf/docx/pptx roots and children. Document retrieval categories are evaluated without a proper document-family content filter, so unrelated images/videos can dominate media-query results. +4. **evals** - Replace placeholder media prompts with grounded intent labels and provenance. Queries such as 'related document' are useful smoke probes but too ambiguous for release-quality diagnostics. +5. **evals** - Rerun cross-modal ablation with the current harness to populate asset_level metrics. The checked-in result is from v0.2.0 and predates serialized asset-level rollups, so it cannot fully separate child-asset hits from parent-memory hits. +6. **ingest** - Keep strengthening captions, transcripts, and OCR as first-class retrieval text. Where RRF improves over vector-only, the improvement is evidence that derived text is helping and should be cached/versioned rather than recomputed ad hoc. +7. **model_research** - Benchmark visual/document-specialized retrievers against the weak categories. 
ViDoRe-style visual document retrieval and MTEB/BEIR-style qrels offer better external baselines for document-heavy failures than anecdotes from one synthetic corpus. + +## Category Evidence + +### `image_to_document` + +- Queries: 2 +- Target result family: `document_family` +- Configured benchmark content filter: `None` +- Best stage/R@5: Vector-only / 0.0% +- Raw vector R@5/R@10: 0.0% / 0.0% +- RRF R@5/R@10: 0.0% / 50.0% +- Hybrid R@5/R@10: 0.0% / 50.0% +- Top-5 audit source counts: `original_fts`=8, `original_vec`=8 +- `under_sampled_category` (high): 2 queries is below the 20-query diagnostic floor. +- `bm25_modality_blind` (medium): BM25 can't process image queries +- `embedding_alignment_gap` (high): Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60. +- `derived_text_probe_lift` (positive): RRF improves R@5 by +0.000 and R@10 by +0.500 over raw vector search. +- `reranker_no_lift` (medium): Hybrid reranker changes R@5 by only +0.000 versus RRF. +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- `document_family_filter_gap` (medium): Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter. +- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity. 
+- Example misses: + - `related document` (image): expected ['documents/ai_strategy_report.docx', 'documents/ai_architecture_deck.pptx', 'documents/embedding_research.pdf']; top results `images/neural_network_diagram.png`, `images/whiteboard_brainstorm.png`, `images/whiteboard_architecture.png` + - `related document` (image): expected ['documents/recallforge_spec.docx', 'documents/operations_manual.pdf']; top results `images/code_editor_screenshot.png`, `videos/coding_demo.mp4::transcript:0001@0.00s`, `images/whiteboard_architecture.png` + +### `video_to_document` + +- Queries: 1 +- Target result family: `document_family` +- Configured benchmark content filter: `None` +- Best stage/R@5: Vector-only / 0.0% +- Raw vector R@5/R@10: 0.0% / 0.0% +- RRF R@5/R@10: 0.0% / 0.0% +- Hybrid R@5/R@10: 0.0% / 0.0% +- Top-5 audit source counts: `original_fts`=4, `original_vec`=4 +- `under_sampled_category` (high): 1 query is below the 20-query diagnostic floor. +- `bm25_modality_blind` (medium): BM25 can't process video queries +- `embedding_alignment_gap` (high): Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60. +- `derived_text_probe_insufficient` (high): RRF R@5=0.000 does not materially lift vector R@5=0.000. +- `reranker_no_lift` (medium): Hybrid reranker changes R@5 by only +0.000 versus RRF. +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- `document_family_filter_gap` (medium): Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter. +- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity. +- `media_query_path_missing` (low): 1 per-query row omits image_query_path or video_query_path instrumentation.
+- Example misses: + - `related document` (video): expected ['documents/ai_architecture_deck.pptx']; top results `videos/architecture_walkthrough.mp4`, `videos/architecture_walkthrough.mp4::frame:0004@15.00s`, `videos/architecture_walkthrough.mp4::frame:0006@25.00s` + +### `video_to_image` + +- Queries: 2 +- Target result family: `image` +- Configured benchmark content filter: `image` +- Best stage/R@5: Vector-only / 0.0% +- Raw vector R@5/R@10: 0.0% / 100.0% +- RRF R@5/R@10: 0.0% / 100.0% +- Hybrid R@5/R@10: 0.0% / 100.0% +- Top-5 audit source counts: `original_fts`=7, `original_vec`=8 +- `under_sampled_category` (high): 2 queries is below the 20-query diagnostic floor. +- `bm25_modality_blind` (medium): BM25 can't process video queries +- `embedding_alignment_gap` (high): Vector-only R@5=0.000; media-query categories need raw embedding alignment above 0.60. +- `derived_text_probe_insufficient` (high): RRF R@5=0.000 does not materially lift vector R@5=0.000. +- `reranker_no_lift` (medium): Hybrid reranker changes R@5 by only +0.000 versus RRF. +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity. +- `media_query_path_missing` (low): 2 per-query rows omit image_query_path or video_query_path instrumentation. 
+- Example misses: + - `related image` (video): expected ['images/forest_landscape.png', 'images/mountain_landscape.png', 'images/ocean_beach.png']; top results `videos/nature_timelapse.mp4::frame:0006@25.00s`, `videos/nature_timelapse.mp4::frame:0005@20.00s`, `videos/nature_timelapse.mp4::frame:0007@30.00s` + - `related image` (video): expected ['images/whiteboard_brainstorm.png', 'images/whiteboard_architecture.png']; top results `videos/whiteboard_session.mp4::frame:0003@10.00s`, `videos/whiteboard_session.mp4::frame:0002@5.00s`, `videos/whiteboard_session.mp4::frame:0001@0.00s` + +### `video_to_text` + +- Queries: 3 +- Target result family: `text` +- Configured benchmark content filter: `text` +- Best stage/R@5: Vector-only / 33.3% +- Raw vector R@5/R@10: 33.3% / 33.3% +- RRF R@5/R@10: 33.3% / 33.3% +- Hybrid R@5/R@10: 33.3% / 33.3% +- Top-5 audit source counts: `original_fts`=8, `original_vec`=12 +- `under_sampled_category` (high): 3 queries is below the 20-query diagnostic floor. +- `bm25_modality_blind` (medium): BM25 can't process video queries +- `embedding_alignment_gap` (high): Vector-only R@5=0.333; media-query categories need raw embedding alignment above 0.60. +- `derived_text_probe_insufficient` (high): RRF R@5=0.333 does not materially lift vector R@5=0.333. +- `reranker_no_lift` (medium): Hybrid reranker changes R@5 by only +0.000 versus RRF. +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity. +- `media_query_path_missing` (low): 3 per-query rows omit image_query_path or video_query_path instrumentation. 
+- Example misses: + - `related text` (video): expected ['text/tech_cybersecurity.md', 'text/tech_cloud_computing.md']; top results `videos/coding_demo.mp4::transcript:0002@2.00s`, `videos/whiteboard_session.mp4::transcript:0001@0.00s`, `text/medicine_nutrition.md` + - `related text` (video): expected ['text/architecture_gothic.md', 'text/architecture_modern.md', 'text/architecture_blueprints.md']; top results `videos/coding_demo.mp4::transcript:0002@2.00s`, `text/ai_agents.md`, `documents/ai_architecture_deck.pptx::slide:0001` + +### `image_to_image` + +- Queries: 3 +- Target result family: `image` +- Configured benchmark content filter: `image` +- Best stage/R@5: Vector-only / 100.0% +- Raw vector R@5/R@10: 100.0% / 100.0% +- RRF R@5/R@10: 100.0% / 100.0% +- Hybrid R@5/R@10: 100.0% / 100.0% +- Top-5 audit source counts: `original_fts`=8, `original_vec`=12 +- `under_sampled_category` (high): 3 queries is below the 20-query diagnostic floor. +- `bm25_modality_blind` (medium): BM25 can't process image queries +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. + +### `image_to_text` + +- Queries: 15 +- Target result family: `text` +- Configured benchmark content filter: `text` +- Best stage/R@5: Vector + BM25 (RRF) / 60.0% +- Raw vector R@5/R@10: 46.7% / 73.3% +- RRF R@5/R@10: 60.0% / 86.7% +- Hybrid R@5/R@10: 60.0% / 86.7% +- Top-5 audit source counts: `original_fts`=60, `original_vec`=63 +- `under_sampled_category` (medium): 15 queries is below the 20-query diagnostic floor. +- `bm25_modality_blind` (medium): BM25 can't process image queries +- `embedding_alignment_gap` (high): Vector-only R@5=0.467; media-query categories need raw embedding alignment above 0.60. +- `derived_text_probe_lift` (positive): RRF improves R@5 by +0.133 and R@10 by +0.133 over raw vector search. 
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity. +- Example misses: + - `` (image): expected ['text/nature_oceans.md']; top results `text/sports_golf.md`, `text/music_production.md`, `text/nature_forests.md` + - `` (image): expected ['text/ai_agents.md', 'text/tech_edge_ai.md']; top results `videos/architecture_walkthrough.mp4::transcript:0001@0.00s`, `text/tech_cloud_computing.md`, `videos/whiteboard_session.mp4::transcript:0001@0.00s` + - `` (image): expected ['text/ai_agents.md', 'text/tech_cloud_computing.md']; top results `videos/whiteboard_session.mp4::transcript:0002@2.00s`, `videos/whiteboard_session.mp4::transcript:0001@0.00s`, `videos/architecture_walkthrough.mp4::transcript:0001@0.00s` + +### `image_to_video` + +- Queries: 2 +- Target result family: `video` +- Configured benchmark content filter: `video` +- Best stage/R@5: Vector-only / 100.0% +- Raw vector R@5/R@10: 100.0% / 100.0% +- RRF R@5/R@10: 100.0% / 100.0% +- Hybrid R@5/R@10: 100.0% / 100.0% +- Top-5 audit source counts: `original_fts`=2, `original_vec`=8 +- `under_sampled_category` (high): 2 queries is below the 20-query diagnostic floor. +- `bm25_modality_blind` (medium): BM25 can't process image queries +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- `generic_query_artifact` (medium): Most media-query prompts are generic placeholders such as 'related document', so scores mix retrieval quality with query-definition ambiguity. 
+ +### `text_to_document` + +- Queries: 12 +- Target result family: `document_family` +- Configured benchmark content filter: `None` +- Best stage/R@5: Vector-only / 100.0% +- Raw vector R@5/R@10: 100.0% / 100.0% +- RRF R@5/R@10: 100.0% / 100.0% +- Hybrid R@5/R@10: 100.0% / 100.0% +- Top-5 audit source counts: `original_fts`=44, `original_vec`=48 +- `under_sampled_category` (medium): 12 queries is below the 20-query diagnostic floor. +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- `document_family_filter_gap` (medium): Benchmark cannot currently constrain results to the pdf/docx/pptx document family with a single content_type filter. + +### `video_to_video` + +- Queries: 1 +- Target result family: `video` +- Configured benchmark content filter: `video` +- Best stage/R@5: Vector-only / 100.0% +- Raw vector R@5/R@10: 100.0% / 100.0% +- RRF R@5/R@10: 100.0% / 100.0% +- Hybrid R@5/R@10: 100.0% / 100.0% +- Top-5 audit source counts: `original_vec`=4 +- `under_sampled_category` (high): 1 query is below the 20-query diagnostic floor. +- `bm25_modality_blind` (medium): BM25 can't process video queries +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- `media_query_path_missing` (low): 1 per-query row omits image_query_path or video_query_path instrumentation. + +### `text_to_image` + +- Queries: 18 +- Target result family: `image` +- Configured benchmark content filter: `image` +- Best stage/R@5: Vector + BM25 (RRF) / 94.4% +- Raw vector R@5/R@10: 88.9% / 94.4% +- RRF R@5/R@10: 94.4% / 94.4% +- Hybrid R@5/R@10: 94.4% / 94.4% +- Top-5 audit source counts: `original_fts`=17, `original_vec`=72 +- `under_sampled_category` (medium): 18 queries is below the 20-query diagnostic floor.
+- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- Example misses: + - `coastal landscape photography` (text): expected ['images/ocean_beach.png']; top results `videos/nature_timelapse.mp4::frame:0006@25.00s`, `videos/nature_timelapse.mp4::frame:0005@20.00s`, `videos/nature_timelapse.mp4::frame:0003@10.00s` + +### `text_to_video` + +- Queries: 15 +- Target result family: `video` +- Configured benchmark content filter: `video` +- Best stage/R@5: Vector-only / 100.0% +- Raw vector R@5/R@10: 100.0% / 100.0% +- RRF R@5/R@10: 100.0% / 100.0% +- Hybrid R@5/R@10: 100.0% / 100.0% +- Top-5 audit source counts: `original_fts`=1, `original_vec`=60 +- `under_sampled_category` (medium): 15 queries is below the 20-query diagnostic floor. +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. + +### `mixed_modal` + +- Queries: 20 +- Target result family: `mixed` +- Configured benchmark content filter: `None` +- Best stage/R@5: BM25-only / 95.0% +- Raw vector R@5/R@10: 85.0% / 100.0% +- RRF R@5/R@10: 95.0% / 100.0% +- Hybrid R@5/R@10: 90.0% / 100.0% +- Top-5 audit source counts: `original_fts`=75, `original_vec`=78 +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. 
+- Example misses: + - `comprehensive guide to athletic performance` (text): expected ['text/sports_running.md', 'text/sports_cycling.md', 'text/sports_swimming.md', 'text/sports_yoga.md', 'text/medicine_nutrition.md', 'text/medicine_cardiology.md']; top results `text/cooking_sourdough.md`, `documents/ai_architecture_deck.pptx::slide:0003`, `documents/edge_deployment_guide.pdf::page:0001` + +### `text_to_text` + +- Queries: 60 +- Target result family: `text` +- Configured benchmark content filter: `text` +- Best stage/R@5: Vector + BM25 (RRF) / 90.0% +- Raw vector R@5/R@10: 88.3% / 90.0% +- RRF R@5/R@10: 90.0% / 91.7% +- Hybrid R@5/R@10: 90.0% / 91.7% +- Top-5 audit source counts: `original_fts`=188, `original_vec`=241 +- `parent_asset_metrics_missing` (medium): Saved payload lacks asset_level metrics; rerun the benchmark with the current harness to separate parent-memory and child-asset hits. +- Example misses: + - `how do computers understand the meaning of words` (text): expected ['text/ai_embeddings.md']; top results `text/ai_transformers.md`, `text/tech_quantum_computing.md`, `text/tech_cloud_computing.md` + - `underwater basket weaving techniques` (text): expected []; top results `text/sports_swimming.md`, `text/architecture_gothic.md`, `text/cooking_sourdough.md` + - `medieval jousting tournament rules and equipment` (text): expected []; top results `text/sports_swimming.md`, `text/sports_cycling.md`, `text/history_renaissance.md` + +## Method Notes + +- Vector-only is treated as the raw embedding baseline. +- RRF lift over vector-only is treated as evidence from derived text probes such as captions, transcripts, OCR, or BM25 text. +- Hybrid-minus-RRF isolates the current reranker contribution. +- Parent-memory versus asset-level scoring is only available when the source payload includes asset_level metrics. 
+ +## External Evaluation References + +- [BEIR](https://github.com/beir-cellar/beir) structures retrieval evaluation around corpus, queries, qrels, run results, and metrics such as NDCG, MAP, Recall, Precision, and MRR. +- [MTEB](https://github.com/embeddings-benchmark/mteb) is the broader embedding and retrieval evaluation framework now used by ViDoRe for single-model retriever submissions. +- [ViDoRe pipeline evaluation](https://github.com/illuin-tech/vidore-benchmark) explicitly covers multi-stage, hybrid, reranking, OCR, and custom preprocessing pipelines for visual document retrieval. diff --git a/tests/test_cross_modal_benchmark_defs.py b/tests/test_cross_modal_benchmark_defs.py index 6ad5116..a660360 100644 --- a/tests/test_cross_modal_benchmark_defs.py +++ b/tests/test_cross_modal_benchmark_defs.py @@ -287,6 +287,52 @@ def test_output_payload_tracks_partial_progress(self): payload["stages"]["Vector-only"]["text_to_text"]["per_query_results"][0]["asset_level"]["hit_at_1"] ) + def test_output_payload_preserves_video_query_path(self): + module = _load_cross_modal_ablation() + + gt = module.VIDEO_TO_TEXT[0] + stage_result = module.StageResult( + stage="Vector-only", + category=gt.category, + total_queries=1, + hits_at_1=0, + hits_at_5=0, + hits_at_10=0, + ) + stage_result.per_query_results.append( + { + "query": gt.query, + "query_type": gt.query_type, + "image_query_path": gt.image_query_path, + "video_query_path": gt.video_query_path, + "relevant_paths": gt.relevant_paths, + "hit_at_1": False, + "hit_at_5": False, + "hit_at_10": False, + } + ) + + payload = module._build_output_payload( + {gt.category: [gt]}, + {"Vector-only": {gt.category: stage_result}}, + [("Vector-only", "embed")], + expansion_profile=module._resolve_expansion_profile("caption_only"), + smoke_profile="safe", + rss_limit_mb=None, + peak_rss_mb=None, + indexed_items=1, + run_status="complete", + interrupted=False, + completed_stages=["Vector-only"], + current_stage=None, + 
current_category=None, + ) + + self.assertEqual( + payload["stages"]["Vector-only"][gt.category]["per_query_results"][0]["video_query_path"], + gt.video_query_path, + ) + def test_resolve_expansion_profile_variants(self): module = _load_cross_modal_ablation() diff --git a/tests/test_cross_modal_diagnostics.py b/tests/test_cross_modal_diagnostics.py new file mode 100644 index 0000000..e723593 --- /dev/null +++ b/tests/test_cross_modal_diagnostics.py @@ -0,0 +1,266 @@ +""" +Regression tests for cross-modal benchmark diagnostics. +""" + +import importlib.util +import sys +import unittest +from pathlib import Path + + +def _load_diagnostics_module(): + repo_root = Path(__file__).resolve().parent.parent + module_path = repo_root / "benchmarks" / "cross_modal_diagnostics.py" + spec = importlib.util.spec_from_file_location("cross_modal_diagnostics", module_path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +class TestCrossModalDiagnostics(unittest.TestCase): + def _synthetic_payload(self): + return { + "benchmark": "cross_modal_ablation", + "version": "0.2.0", + "generated_at": "2026-05-17T00:00:00+00:00", + "run_status": "complete", + "categories": { + "image_to_document": {"queries": 2}, + "image_to_text": {"queries": 15}, + "mixed_modal": {"queries": 20}, + }, + "stages": { + "Vector-only": { + "image_to_document": { + "skipped": False, + "total_queries": 2, + "recall_at_5": 0.0, + "recall_at_10": 0.0, + "per_query_results": [ + { + "query": "related document", + "query_type": "image", + "image_query_path": "images/neural_network_diagram.png", + "relevant_paths": ["documents/ai_strategy_report.docx"], + "hit_at_5": False, + "results": [ + { + "filepath": "recallforge://benchmark//repo/tests/uat/corpus/images/neural_network_diagram.png" + } + ], + }, + { + "query": "related document", + "query_type": "image", + "image_query_path": 
"images/floor_plan_blueprint.png", + "relevant_paths": ["documents/operations_manual.pdf"], + "hit_at_5": False, + "results": [], + }, + ], + }, + "image_to_text": { + "skipped": False, + "total_queries": 15, + "recall_at_5": 0.4, + "recall_at_10": 0.5, + "asset_level": {"recall_at_5": 0.4}, + "per_query_results": [ + { + "query": "", + "query_type": "image", + "image_query_path": "images/ocean_beach.png", + "relevant_paths": ["text/nature_oceans.md"], + "hit_at_5": False, + "results": [], + } + ], + }, + "mixed_modal": { + "skipped": False, + "total_queries": 20, + "recall_at_5": 0.9, + "recall_at_10": 1.0, + "asset_level": {"recall_at_5": 0.9}, + "per_query_results": [], + }, + }, + "BM25-only": { + "image_to_document": { + "skipped": True, + "total_queries": 2, + "skip_reason": "BM25 can't process image queries", + "per_query_results": [], + }, + "image_to_text": { + "skipped": True, + "total_queries": 15, + "skip_reason": "BM25 can't process image queries", + "per_query_results": [], + }, + "mixed_modal": { + "skipped": False, + "total_queries": 20, + "recall_at_5": 0.8, + "recall_at_10": 0.9, + "asset_level": {"recall_at_5": 0.8}, + "per_query_results": [], + }, + }, + "Vector + BM25 (RRF)": { + "image_to_document": { + "skipped": False, + "total_queries": 2, + "recall_at_5": 0.0, + "recall_at_10": 0.5, + "per_query_results": [ + { + "query": "related document", + "query_type": "image", + "image_query_path": "images/neural_network_diagram.png", + "relevant_paths": ["documents/ai_strategy_report.docx"], + "hit_at_5": False, + "results": [ + { + "filepath": "recallforge://benchmark//repo/tests/uat/corpus/text/science_neuroscience.md", + "audit": {"rrf_sources": {"original_vec": 3, "original_fts": 1}}, + } + ], + } + ], + }, + "image_to_text": { + "skipped": False, + "total_queries": 15, + "recall_at_5": 0.65, + "recall_at_10": 0.8, + "asset_level": {"recall_at_5": 0.65}, + "per_query_results": [ + { + "query": "", + "query_type": "image", + "image_query_path": 
"images/ocean_beach.png", + "relevant_paths": ["text/nature_oceans.md"], + "hit_at_5": True, + "results": [ + { + "filepath": "recallforge://benchmark//repo/tests/uat/corpus/text/nature_oceans.md", + "audit": {"rrf_sources": {"original_vec": 1, "original_fts": 2}}, + } + ], + } + ], + }, + "mixed_modal": { + "skipped": False, + "total_queries": 20, + "recall_at_5": 0.95, + "recall_at_10": 1.0, + "asset_level": {"recall_at_5": 0.95}, + "per_query_results": [], + }, + }, + "Vector + BM25 + Reranker": { + "image_to_document": { + "skipped": False, + "total_queries": 2, + "recall_at_5": 0.0, + "recall_at_10": 0.5, + "per_query_results": [], + }, + "image_to_text": { + "skipped": False, + "total_queries": 15, + "recall_at_5": 0.65, + "recall_at_10": 0.8, + "asset_level": {"recall_at_5": 0.65}, + "per_query_results": [], + }, + "mixed_modal": { + "skipped": False, + "total_queries": 20, + "recall_at_5": 0.95, + "recall_at_10": 1.0, + "asset_level": {"recall_at_5": 0.95}, + "per_query_results": [], + }, + }, + }, + } + + def test_diagnostics_rank_and_classify_weak_categories(self): + module = _load_diagnostics_module() + + diagnostics = module.build_diagnostics( + self._synthetic_payload(), + content_type_filters={ + "image_to_document": None, + "image_to_text": "text", + "mixed_modal": None, + }, + min_queries=20, + weak_recall_at_5=0.6, + ) + + self.assertEqual(diagnostics["weak_categories"][0]["category"], "image_to_document") + issues = { + issue["code"] + for issue in diagnostics["weak_categories"][0]["issues"] + } + self.assertIn("under_sampled_category", issues) + self.assertIn("bm25_modality_blind", issues) + self.assertIn("embedding_alignment_gap", issues) + self.assertIn("document_family_filter_gap", issues) + self.assertIn("generic_query_artifact", issues) + self.assertIn("parent_asset_metrics_missing", issues) + self.assertEqual( + diagnostics["weak_categories"][0]["audit_source_counts_top5"], + {"original_fts": 1, "original_vec": 1}, + ) + + def 
test_diagnostics_detect_derived_text_lift(self): + module = _load_diagnostics_module() + + diagnostics = module.build_diagnostics( + self._synthetic_payload(), + content_type_filters={ + "image_to_document": None, + "image_to_text": "text", + "mixed_modal": None, + }, + min_queries=20, + weak_recall_at_5=0.6, + ) + image_to_text = next( + item for item in diagnostics["all_categories"] if item["category"] == "image_to_text" + ) + issues = {issue["code"] for issue in image_to_text["issues"]} + + self.assertIn("derived_text_probe_lift", issues) + self.assertNotIn("parent_asset_metrics_missing", issues) + + def test_markdown_report_contains_ranked_fixes(self): + module = _load_diagnostics_module() + diagnostics = module.build_diagnostics( + self._synthetic_payload(), + content_type_filters={ + "image_to_document": None, + "image_to_text": "text", + "mixed_modal": None, + }, + min_queries=20, + weak_recall_at_5=0.6, + ) + + markdown = module.render_markdown(diagnostics) + + self.assertIn("# Cross-Modal Retrieval Diagnostics", markdown) + self.assertIn("`image_to_document`", markdown) + self.assertIn("Prioritized Fix List", markdown) + self.assertIn("BEIR", markdown) + + +if __name__ == "__main__": + unittest.main()