diff --git a/CHANGELOG.md b/CHANGELOG.md index 4df7f54..19952bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to RecallForge will be documented in this file. ## [Unreleased] -*Nothing yet.* +- Replaced the tiny UAT video clips with compact episodic-memory fixtures, richer transcript sidecars, related artifact metadata, and regression coverage for the video corpus. ## [0.2.1] — 2026-05-17 diff --git a/README.md b/README.md index 6afbba9..4fc40f6 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ The reranker delivers **+20.7% R@1 over RRF fusion** and pushes R@10 to 97.8%. E *Benchmark categories: text_only (30 queries), image_only (30 queries), long_query (12 queries), typo_query (20 queries). See `benchmarks/results/pipeline_ablation_modality_results.json` for full breakdown.* -For release validation, use `benchmarks/cross_modal_ablation.py`. It checkpoints JSON output as it runs, so long MLX benchmark sessions still leave behind a partial artifact if interrupted. To turn that artifact into a ranked fix list, run `benchmarks/cross_modal_diagnostics.py`; the current report is in [docs/research/cross-modal-diagnostics.md](docs/research/cross-modal-diagnostics.md). +For release validation, use `benchmarks/cross_modal_ablation.py`. It checkpoints JSON output as it runs, so long MLX benchmark sessions still leave behind a partial artifact if interrupted. The UAT video corpus now uses compact episodic fixtures with searchable transcript sidecars and related artifact metadata, so video queries exercise meeting, screen-recording, walkthrough, field, and recipe-style memories. To turn a benchmark artifact into a ranked fix list, run `benchmarks/cross_modal_diagnostics.py`; the current report is in [docs/research/cross-modal-diagnostics.md](docs/research/cross-modal-diagnostics.md). 
### Latency & resource usage diff --git a/benchmarks/cross_modal_ablation.py b/benchmarks/cross_modal_ablation.py index 572e4eb..7b55365 100644 --- a/benchmarks/cross_modal_ablation.py +++ b/benchmarks/cross_modal_ablation.py @@ -1001,37 +1001,37 @@ def _media_query_variants( TEXT_TO_VIDEO = [ # EASY (6 queries) GroundTruth( - query="architecture walkthrough building tour presentation", + query="office walkthrough connecting floor plan and system architecture", relevant_paths=["videos/architecture_walkthrough.mp4"], category="text_to_video", difficulty="easy", ), GroundTruth( - query="coding demonstration software development tutorial", + query="screen recording debugging RecallForge video search test", relevant_paths=["videos/coding_demo.mp4"], category="text_to_video", difficulty="easy", ), GroundTruth( - query="cooking tutorial recipe demonstration video", + query="family dinner pasta recipe video with handwritten substitutions", relevant_paths=["videos/cooking_tutorial.mp4"], category="text_to_video", difficulty="easy", ), GroundTruth( - query="nature timelapse video forest mountains", + query="weekend trail scouting video forest mountain coast", relevant_paths=["videos/nature_timelapse.mp4"], category="text_to_video", difficulty="easy", ), GroundTruth( - query="whiteboard session brainstorming meeting recording", + query="product planning whiteboard meeting memory rollups", relevant_paths=["videos/whiteboard_session.mp4"], category="text_to_video", difficulty="easy", ), GroundTruth( - query="video content with transcript about buildings", + query="walkthrough transcript about floor plan architecture deck", relevant_paths=["videos/architecture_walkthrough.mp4", "videos/architecture_walkthrough.transcript.json"], category="text_to_video", difficulty="easy", @@ -1040,33 +1040,33 @@ def _media_query_variants( # MEDIUM (6 queries) GroundTruth( - query="programming and software engineering video content", + query="developer screen recording and meeting notes about 
search pipeline", relevant_paths=["videos/coding_demo.mp4", "videos/whiteboard_session.mp4"], category="text_to_video", difficulty="medium", graded_relevance={"videos/coding_demo.mp4": 2, "videos/whiteboard_session.mp4": 1}, ), GroundTruth( - query="food preparation and culinary instruction videos", + query="recipe memory with pasta sauce timing and grocery planning", relevant_paths=["videos/cooking_tutorial.mp4"], category="text_to_video", difficulty="medium", ), GroundTruth( - query="natural environment scenery video footage", + query="outdoor field clip with route planning and park notes", relevant_paths=["videos/nature_timelapse.mp4"], category="text_to_video", difficulty="medium", ), GroundTruth( - query="meeting recordings with transcripts for review", + query="meeting recordings with transcript action items for review", relevant_paths=["videos/whiteboard_session.mp4", "videos/whiteboard_session.transcript.json"], category="text_to_video", difficulty="medium", graded_relevance={"videos/whiteboard_session.mp4": 2, "videos/whiteboard_session.transcript.json": 2}, ), GroundTruth( - query="educational video content with searchable transcripts", + query="searchable transcript memories from kitchen and developer videos", relevant_paths=["videos/cooking_tutorial.mp4", "videos/cooking_tutorial.transcript.json", "videos/coding_demo.mp4", "videos/coding_demo.transcript.json"], category="text_to_video", @@ -1075,7 +1075,7 @@ def _media_query_variants( "videos/coding_demo.mp4": 2, "videos/coding_demo.transcript.json": 2}, ), GroundTruth( - query="visual documentation of outdoor spaces", + query="visual documentation of outdoor spaces and walkthrough locations", relevant_paths=["videos/nature_timelapse.mp4", "videos/architecture_walkthrough.mp4"], category="text_to_video", difficulty="medium", @@ -1084,7 +1084,7 @@ def _media_query_variants( # HARD (3 queries) GroundTruth( - query="multimedia content for learning and development", + query="episodic videos with procedural 
learning and follow-up actions", relevant_paths=["videos/cooking_tutorial.mp4", "videos/coding_demo.mp4", "videos/whiteboard_session.mp4", "videos/cooking_tutorial.transcript.json", "videos/coding_demo.transcript.json", "videos/whiteboard_session.transcript.json"], category="text_to_video", @@ -1093,7 +1093,7 @@ def _media_query_variants( "videos/cooking_tutorial.transcript.json": 2, "videos/coding_demo.transcript.json": 2, "videos/whiteboard_session.transcript.json": 2}, ), GroundTruth( - query="archived recordings with searchable text content", + query="archived recordings with field notes and architecture narration", relevant_paths=["videos/architecture_walkthrough.mp4", "videos/architecture_walkthrough.transcript.json", "videos/nature_timelapse.mp4", "videos/nature_timelapse.transcript.json"], category="text_to_video", @@ -1102,7 +1102,7 @@ def _media_query_variants( "videos/nature_timelapse.mp4": 2, "videos/nature_timelapse.transcript.json": 2}, ), GroundTruth( - query="comprehensive video library with transcripts", + query="comprehensive episodic video library with transcripts", relevant_paths=["videos/cooking_tutorial.mp4", "videos/coding_demo.mp4", "videos/whiteboard_session.mp4", "videos/architecture_walkthrough.mp4", "videos/nature_timelapse.mp4"], category="text_to_video", difficulty="hard", @@ -1434,7 +1434,7 @@ def _media_query_variants( query="related text", query_type="video", video_query_path="videos/coding_demo.mp4", - relevant_paths=["text/tech_cybersecurity.md", "text/tech_cloud_computing.md"], + relevant_paths=["text/ai_agents.md", "text/tech_cloud_computing.md", "text/ai_embeddings.md"], category="video_to_text", difficulty="hard", ), diff --git a/docs/RELEASE.md b/docs/RELEASE.md index 40d4a97..dcc79ab 100644 --- a/docs/RELEASE.md +++ b/docs/RELEASE.md @@ -43,6 +43,18 @@ Then run the expanded benchmark: .venv/bin/python benchmarks/cross_modal_ablation.py --backend mlx --expansion-profile caption_only --output 
benchmarks/results/cross_modal_ablation_results.json ``` +The committed video corpus is an episodic fixture set rather than generic toy clips. Before trusting video-related benchmark changes, confirm the generated sidecars still include searchable `text`, timed segments, and related image/document metadata: + +```bash +.venv/bin/python -m pytest -q tests/test_video_corpus.py tests/test_video_sidecars.py +``` + +The shell video-quality UAT uses a deterministic backend by default so CI and local smoke runs are not gated on live model quality. To exercise the installed vision-language backend on this host, opt in explicitly: + +```bash +UAT_VIDEO_LIVE=1 bash tests/uat/test_video_quality.sh +``` + The benchmark now checkpoints to JSON as it runs. If the run is interrupted, the output file still contains partial results plus progress metadata. After a complete or partial benchmark run, generate the cross-modal diagnosis report: diff --git a/tests/test_video_corpus.py b/tests/test_video_corpus.py new file mode 100644 index 0000000..d5b1e5f --- /dev/null +++ b/tests/test_video_corpus.py @@ -0,0 +1,59 @@ +"""Regression tests for the committed episodic video corpus.""" + +from __future__ import annotations + +import importlib.util +import json +import sys +import unittest +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parent.parent +GENERATOR_PATH = REPO_ROOT / "tests" / "uat" / "helpers" / "generate_video_corpus.py" +VIDEOS_DIR = REPO_ROOT / "tests" / "uat" / "corpus" / "videos" + + +def _load_generator(): + spec = importlib.util.spec_from_file_location("generate_video_corpus", GENERATOR_PATH) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +class TestEpisodicVideoCorpus(unittest.TestCase): + def test_generator_specs_are_rich_episodic_fixtures(self): + module = _load_generator() + + self.assertEqual(len(module.VIDEOS), 5) + for spec 
in module.VIDEOS: + with self.subTest(video=spec["name"]): + self.assertGreaterEqual(spec["duration"], 9) + self.assertGreaterEqual(len(spec["images"]), 2) + self.assertGreaterEqual(len(spec["transcript"]), 3) + self.assertTrue(spec["scenario"]) + self.assertTrue(spec["notes"]) + self.assertTrue(spec["related_images"]) + self.assertTrue(spec["related_documents"]) + + def test_committed_sidecars_include_searchable_transcript_text(self): + sidecars = sorted(VIDEOS_DIR.glob("*.transcript.json")) + + self.assertEqual(len(sidecars), 5) + for sidecar in sidecars: + with self.subTest(sidecar=sidecar.name): + payload = json.loads(sidecar.read_text(encoding="utf-8")) + self.assertEqual(payload["memory_type"], "episodic_video_fixture") + self.assertTrue(payload["scenario"]) + self.assertTrue(payload["description"]) + self.assertTrue(payload["notes"]) + self.assertTrue(payload["text"]) + self.assertGreaterEqual(len(payload["segments"]), 3) + self.assertTrue(payload["related_images"]) + self.assertTrue(payload["related_documents"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/uat/README.md b/tests/uat/README.md index e2efb0f..e11f28b 100644 --- a/tests/uat/README.md +++ b/tests/uat/README.md @@ -6,7 +6,7 @@ Manual end-to-end test suite for RecallForge. Validates correctness gates, bench - **Python 3.12+** - **RecallForge installed:** `pip install -e .` (from repo root) -- **ffmpeg** (for video frame extraction and synthetic video generation) +- **ffmpeg** (for video frame extraction and regenerating committed video fixtures) - **Backends:** torch (CPU/CUDA) and/or MLX (macOS ARM64) - **Memory requirements:** - MLX 4-bit embed mode: ~1.7GB @@ -42,7 +42,7 @@ All tests live in `tests/uat/`. 
Each is self-contained and can be run independen | `test_tiered_modes.sh` | Tiered modes (embed/hybrid/full) loading and behavior | | `test_document_ingest.sh` | Document ingest (PDF/DOCX/PPTX extraction via CLI) | | `test_video_ingest.sh` | Video ingest (transcript fallback + ffmpeg frame extraction) | -| `test_video_quality.sh` | Video retrieval quality (text/image/video query coverage) | +| `test_video_quality.sh` | Video retrieval quality (deterministic by default; set `UAT_VIDEO_LIVE=1` for live model retrieval) | | `test_video_query_contract.sh` | Raw video query smoke test | | `test_cross_modal.sh` | ★ CROSS-MODAL SEARCH (key differentiator) | | `test_search_quality.sh` | Search quality (recall@5, MRR, edge cases, dedup) | @@ -85,7 +85,7 @@ Benchmark tests are **informational** — they report metrics but don't block co Tests use a committed video corpus and built-in text/image fixtures in `tests/uat/corpus/`: ### Video Corpus -A committed set of test videos with known transcripts and ground-truth frames. Used by `test_video_ingest.sh`, `test_video_quality.sh`, and `test_video_query_contract.sh` to validate cross-modal retrieval on temporal media. +A committed set of compact episodic video fixtures with known transcripts, related-image/document metadata, and ground-truth frames. The clips cover a screen recording, outdoor field clip, architecture walkthrough, kitchen recipe memory, and product-planning whiteboard session. Used by `test_video_ingest.sh`, `test_video_quality.sh`, and `test_video_query_contract.sh` to validate cross-modal retrieval on temporal media. ### Text Documents (15 files) | Topic | Files | @@ -171,7 +171,7 @@ Each test script exits 0 on success, 1 on any failure. 1. **Torch video crash on Qwen3-VL (REC-44):** Known issue where torch backend crashes during video frame processing with Qwen3-VL models. Workaround: use MLX backend on Apple Silicon or skip video tests when using Qwen3-VL with torch. -2. 
**Synthetic test images:** Generated images are simple drawings, not real photos. Cross-modal accuracy will be lower than with real-world images. This is expected. +2. **Compact local fixtures:** Images and videos are generated/curated to stay small, deterministic, and license-safe. The video corpus now uses episodic memory scenarios with transcripts and related artifacts, but broad public benchmark claims should still be validated against larger real-world datasets. 3. **First run is slow:** Models download on first use (~4GB per model). Subsequent runs use cached models. @@ -185,7 +185,7 @@ Each test script exits 0 on success, 1 on any failure. 8. **Video ingest depends on host capabilities:** Transcript sidecars (`.srt`, `.vtt`, `.txt`) are always supported. Frame extraction runs when `ffmpeg` and `ffprobe` are installed; otherwise video UAT validates transcript-only fallback. -9. **Raw video query requires ffmpeg:** `test_video_query_contract.sh` and raw-video portions of CLI/MCP/video-quality UAT require `ffmpeg` to generate valid synthetic video fixtures. Without it, those checks skip cleanly. +9. **Raw video query requires ffmpeg:** `test_video_query_contract.sh` and raw-video portions of CLI/MCP/video-quality UAT require `ffmpeg` for frame extraction and video fixture regeneration. Without it, those checks skip cleanly. 10. **Document ingest is local-first:** DOCX and PPTX fixtures are extracted through built-in OOXML parsing. PDF ingestion uses a lightweight fallback extractor by default and gets richer parsing when optional PDF tooling is installed. 
diff --git a/tests/uat/corpus/CORPUS_EXPANSION.md b/tests/uat/corpus/CORPUS_EXPANSION.md index 196a688..32a987d 100644 --- a/tests/uat/corpus/CORPUS_EXPANSION.md +++ b/tests/uat/corpus/CORPUS_EXPANSION.md @@ -6,12 +6,32 @@ This document describes the expanded RecallForge benchmark corpus and what addit - **Text documents**: 54 files (15 original + 39 new) - **Images**: 10 files (existing) -- **Videos**: 5 `.mp4` files plus 5 transcript JSON placeholders +- **Videos**: 5 compact episodic `.mp4` fixtures plus 5 rich transcript JSON sidecars - **Documents**: 8 generated `.docx` / `.pptx` / `.pdf` files - **Total corpus documents**: 82 registered in `CORPUS_DOCS` -- **Total indexed benchmark items**: 77 searchable items (the transcript JSON placeholders are empty and not indexed) +- **Total indexed benchmark items**: 82 searchable top-level/sidecar items, plus derived video frame and transcript child memories during video ingest - **Total benchmark queries**: 231 queries across all modalities +## Episodic Video Corpus + +REC-153 replaced the earlier tiny toy clips with a license-safe episodic fixture set. The files are still small enough to commit, but each video now resembles a real personal or work memory: a screen recording, a field clip, a walkthrough, a kitchen note, or a product-planning meeting. 
+ +| File | Memory Scenario | Primary Signals | +|------|-----------------|-----------------| +| `coding_demo.mp4` | RecallForge debugging screen recording | code editor, architecture board, action notes, reranking and transcript discussion | +| `nature_timelapse.mp4` | Weekend trail scouting phone clip | forest, mountain, coast, route planning, park/climate notes | +| `architecture_walkthrough.mp4` | Office and system architecture walkthrough | floor plan, service diagram, model diagram, milestone narration | +| `cooking_tutorial.mp4` | Weeknight family recipe memory | pasta, recipe substitutions, handwritten cooking notes | +| `whiteboard_session.mp4` | Product planning meeting | brainstorm board, parent/child memory rollups, benchmark scoring, release actions | + +Each `.transcript.json` sidecar now includes: + +- timed transcript segments used by video ingest +- a top-level `text` field so the sidecar can also be indexed as a searchable transcript artifact +- `scenario`, `notes`, `related_images`, and `related_documents` metadata for benchmark and documentation provenance + +The design follows the same broad shape as episodic-memory video benchmarks such as [Ego4D Episodic Memory](https://ego4d-data.org/docs/benchmarks/episodic-memory/): queries should be able to recover an event, scene, moment, transcript detail, or related artifact from a video-backed memory. + ## New Text Documents Added (39 files) ### Technology (5 files) @@ -120,18 +140,17 @@ To further expand the corpus for more comprehensive cross-modal testing, the fol 39. **travel_yosemite.jpg** - Yosemite National Park 40. **travel_grand_canyon.jpg** - Grand Canyon landscape -### Recommended Videos to Add - -1. **tech_quantum_explainer.mp4** - Quantum computing explanation -2. **tech_security_demo.mp4** - Cybersecurity demonstration -3. **science_lab_experiment.mp4** - Science lab experiment -4. **cooking_masterclass.mp4** - Professional cooking demonstration -5. 
**sports_highlights.mp4** - Sports highlights reel -6. **history_documentary.mp4** - Historical documentary clip -7. **medicine_procedure.mp4** - Medical procedure video -8. **music_concert.mp4** - Live music performance -9. **art_gallery_tour.mp4** - Art gallery walkthrough -10. **travel_vlog.mp4** - Travel destination vlog +### Future Real-World Video Additions + +The committed corpus is intentionally compact and license-safe. Future benchmark expansions should add opt-in downloaded fixtures or locally supplied clips in these shapes: + +1. **meeting_recording_with_slides.mp4** - transcript-heavy meeting with visible slide/document references +2. **screen_recording_debug_trace.mp4** - developer workflow with code, terminal output, and spoken issue context +3. **mobile_walkthrough_errand.mp4** - personal memory clip with objects, location changes, and follow-up tasks +4. **cooking_or_repair_procedure.mp4** - procedural video with step ordering and recipe/tool notes +5. **document_review_session.mp4** - video that references PDFs, decks, and handwritten annotations +6. **travel_or_field_visit_clip.mp4** - visually rich outdoor clip with route, weather, and place notes +7. 
**classroom_or_tutorial_clip.mp4** - instructional video with transcript-heavy concepts and whiteboard imagery ## Benchmark Query Distribution diff --git a/tests/uat/corpus/videos/architecture_walkthrough.mp4 b/tests/uat/corpus/videos/architecture_walkthrough.mp4 index f8e4782..b7380c8 100644 Binary files a/tests/uat/corpus/videos/architecture_walkthrough.mp4 and b/tests/uat/corpus/videos/architecture_walkthrough.mp4 differ diff --git a/tests/uat/corpus/videos/architecture_walkthrough.transcript.json b/tests/uat/corpus/videos/architecture_walkthrough.transcript.json index 7678fd2..aef5983 100644 --- a/tests/uat/corpus/videos/architecture_walkthrough.transcript.json +++ b/tests/uat/corpus/videos/architecture_walkthrough.transcript.json @@ -1,19 +1,43 @@ { "video": "architecture_walkthrough.mp4", - "description": "Architecture walkthrough sliding between diagrams and blueprints.", - "duration_seconds": 4, - "resolution": "320x240", - "fps": 15, + "memory_type": "episodic_video_fixture", + "scenario": "Hybrid office walkthrough connecting a floor plan to system architecture decisions.", + "description": "Office and system-architecture walkthrough with floor plan, whiteboard, model diagram, and milestone narration.", + "notes": "Useful for queries about architecture walkthroughs, floor plans, system design, model diagrams, and project planning documents.", + "related_images": [ + "images/floor_plan_blueprint.png", + "images/whiteboard_architecture.png", + "images/neural_network_diagram.png" + ], + "related_documents": [ + "documents/ai_architecture_deck.pptx", + "documents/project_status_q1.docx", + "documents/ai_strategy_report.docx" + ], + "duration_seconds": 12, + "resolution": "480x270", + "fps": 12, + "text": "Office and system-architecture walkthrough with floor plan, whiteboard, model diagram, and milestone narration. Hybrid office walkthrough connecting a floor plan to system architecture decisions. 
Useful for queries about architecture walkthroughs, floor plans, system design, model diagrams, and project planning documents. The walkthrough starts on a floor plan, marking the meeting room and hallway where the demo will be installed. A whiteboard maps the API gateway, vector store, full-text index, and worker services. The presenter explains how embeddings, document sections, and media frames roll up into parent memories. The final note ties the physical walkthrough to the architecture deck and project milestone review.", "segments": [ { "start": 0.0, - "end": 2.0, - "text": "Whiteboard diagram showing system architecture components." + "end": 3.0, + "text": "The walkthrough starts on a floor plan, marking the meeting room and hallway where the demo will be installed." }, { - "start": 2.0, - "end": 4.0, - "text": "Detailed floor plan blueprint with room layouts." + "start": 3.0, + "end": 6.0, + "text": "A whiteboard maps the API gateway, vector store, full-text index, and worker services." + }, + { + "start": 6.0, + "end": 9.0, + "text": "The presenter explains how embeddings, document sections, and media frames roll up into parent memories." + }, + { + "start": 9.0, + "end": 12.0, + "text": "The final note ties the physical walkthrough to the architecture deck and project milestone review." 
} ] } diff --git a/tests/uat/corpus/videos/coding_demo.mp4 b/tests/uat/corpus/videos/coding_demo.mp4 index 202939f..a629e66 100644 Binary files a/tests/uat/corpus/videos/coding_demo.mp4 and b/tests/uat/corpus/videos/coding_demo.mp4 differ diff --git a/tests/uat/corpus/videos/coding_demo.transcript.json b/tests/uat/corpus/videos/coding_demo.transcript.json index dc1df4b..2515b8d 100644 --- a/tests/uat/corpus/videos/coding_demo.transcript.json +++ b/tests/uat/corpus/videos/coding_demo.transcript.json @@ -1,19 +1,43 @@ { "video": "coding_demo.mp4", - "description": "Coding tutorial demo with text overlay on code editor screenshot.", - "duration_seconds": 4, - "resolution": "320x240", - "fps": 15, + "memory_type": "episodic_video_fixture", + "scenario": "Screen recording from a late-afternoon RecallForge debugging session.", + "description": "Screen recording of a RecallForge debugging session with code, architecture notes, and action items.", + "notes": "Useful for queries about debugging, screen recordings, RecallForge implementation details, reranking gates, and transcript-heavy developer memories.", + "related_images": [ + "images/code_editor_screenshot.png", + "images/whiteboard_architecture.png", + "images/handwritten_notes.png" + ], + "related_documents": [ + "documents/recallforge_spec.docx", + "documents/operations_manual.pdf", + "documents/edge_deployment_guide.pdf" + ], + "duration_seconds": 12, + "resolution": "480x270", + "fps": 12, + "text": "Screen recording of a RecallForge debugging session with code, architecture notes, and action items. Screen recording from a late-afternoon RecallForge debugging session. Useful for queries about debugging, screen recordings, RecallForge implementation details, reranking gates, and transcript-heavy developer memories. The developer opens the RecallForge search pipeline test and points out a failing video-to-text case. They compare vector retrieval, BM25 fusion, and reranker candidates for the same query. 
The architecture whiteboard shows parent video memories rolling up transcript and frame children. Handwritten notes capture follow-up actions: cap media reranking, keep transcripts searchable, and rerun the benchmark.", "segments": [ { "start": 0.0, - "end": 2.0, - "text": "Code editor showing a Python script." + "end": 3.0, + "text": "The developer opens the RecallForge search pipeline test and points out a failing video-to-text case." }, { - "start": 2.0, - "end": 4.0, - "text": "Walkthrough of coding concepts and syntax." + "start": 3.0, + "end": 6.5, + "text": "They compare vector retrieval, BM25 fusion, and reranker candidates for the same query." + }, + { + "start": 6.5, + "end": 9.5, + "text": "The architecture whiteboard shows parent video memories rolling up transcript and frame children." + }, + { + "start": 9.5, + "end": 12.0, + "text": "Handwritten notes capture follow-up actions: cap media reranking, keep transcripts searchable, and rerun the benchmark." } ] } diff --git a/tests/uat/corpus/videos/cooking_tutorial.mp4 b/tests/uat/corpus/videos/cooking_tutorial.mp4 index aa33952..dd07802 100644 Binary files a/tests/uat/corpus/videos/cooking_tutorial.mp4 and b/tests/uat/corpus/videos/cooking_tutorial.mp4 differ diff --git a/tests/uat/corpus/videos/cooking_tutorial.transcript.json b/tests/uat/corpus/videos/cooking_tutorial.transcript.json index af472af..ee4a544 100644 --- a/tests/uat/corpus/videos/cooking_tutorial.transcript.json +++ b/tests/uat/corpus/videos/cooking_tutorial.transcript.json @@ -1,19 +1,42 @@ { "video": "cooking_tutorial.mp4", - "description": "Cooking tutorial zooming into a pasta dish with recipe text overlay.", - "duration_seconds": 3, - "resolution": "320x240", - "fps": 15, + "memory_type": "episodic_video_fixture", + "scenario": "Kitchen memory from a weeknight family recipe session.", + "description": "Kitchen recipe memory with plated pasta, spoken substitutions, and handwritten cooking notes.", + "notes": "Useful for queries about 
pasta, family dinner, recipe substitutions, grocery planning, and handwritten cooking notes.", + "related_images": [ + "images/food_pasta_dish.png", + "images/handwritten_notes.png" + ], + "related_documents": [ + "text/cooking_pasta.md", + "text/cooking_spices.md", + "text/cooking_asian_cuisine.md" + ], + "duration_seconds": 9, + "resolution": "480x270", + "fps": 12, + "text": "Kitchen recipe memory with plated pasta, spoken substitutions, and handwritten cooking notes. Kitchen memory from a weeknight family recipe session. Useful for queries about pasta, family dinner, recipe substitutions, grocery planning, and handwritten cooking notes. The cook checks the pasta sauce and mentions basil, tomato, olive oil, and a lower-salt variation. A handwritten recipe note records timing changes and a reminder to try chili flakes next time. The plated pasta is compared with earlier cooking notes about fresh dough and sauce texture. The clip ends with a spoken tag for family dinner, recipe recall, and grocery planning.", "segments": [ { "start": 0.0, - "end": 1.5, - "text": "Fresh pasta dish plated on a white plate." + "end": 2.5, + "text": "The cook checks the pasta sauce and mentions basil, tomato, olive oil, and a lower-salt variation." }, { - "start": 1.5, - "end": 3.0, - "text": "Close-up of the pasta with sauce and garnish." + "start": 2.5, + "end": 5.5, + "text": "A handwritten recipe note records timing changes and a reminder to try chili flakes next time." + }, + { + "start": 5.5, + "end": 7.5, + "text": "The plated pasta is compared with earlier cooking notes about fresh dough and sauce texture." + }, + { + "start": 7.5, + "end": 9.0, + "text": "The clip ends with a spoken tag for family dinner, recipe recall, and grocery planning." 
} ] } diff --git a/tests/uat/corpus/videos/nature_timelapse.mp4 b/tests/uat/corpus/videos/nature_timelapse.mp4 index b837b4e..8613160 100644 Binary files a/tests/uat/corpus/videos/nature_timelapse.mp4 and b/tests/uat/corpus/videos/nature_timelapse.mp4 differ diff --git a/tests/uat/corpus/videos/nature_timelapse.transcript.json b/tests/uat/corpus/videos/nature_timelapse.transcript.json index 5bb56a8..d9ec6c8 100644 --- a/tests/uat/corpus/videos/nature_timelapse.transcript.json +++ b/tests/uat/corpus/videos/nature_timelapse.transcript.json @@ -1,19 +1,44 @@ { "video": "nature_timelapse.mp4", - "description": "Nature timelapse panning over forest and mountain landscapes.", - "duration_seconds": 4, - "resolution": "320x240", - "fps": 15, + "memory_type": "episodic_video_fixture", + "scenario": "Phone clip from a weekend trail scouting trip with field notes.", + "description": "Episodic outdoor trip clip spanning forest, mountain, and coastal scenes with field-note narration.", + "notes": "Useful for visual queries over outdoor scenes and transcript queries about route planning, parks, weather, and environmental memories.", + "related_images": [ + "images/forest_landscape.png", + "images/mountain_landscape.png", + "images/ocean_beach.png" + ], + "related_documents": [ + "text/nature_forests.md", + "text/nature_mountains.md", + "text/nature_oceans.md", + "text/travel_national_parks.md" + ], + "duration_seconds": 12, + "resolution": "480x270", + "fps": 12, + "text": "Episodic outdoor trip clip spanning forest, mountain, and coastal scenes with field-note narration. Phone clip from a weekend trail scouting trip with field notes. Useful for visual queries over outdoor scenes and transcript queries about route planning, parks, weather, and environmental memories. The walk begins under a dense forest canopy with notes about tree cover and the shaded trail. The camera pauses at a mountain overlook while the narrator mentions weather, elevation, and route planning. 
A short coastal stop captures waves and beach access for a possible return trip. The clip ends with a reminder to compare these views with national park and climate notes.", "segments": [ { "start": 0.0, - "end": 2.0, - "text": "Dense forest landscape with green canopy." + "end": 3.5, + "text": "The walk begins under a dense forest canopy with notes about tree cover and the shaded trail." }, { - "start": 2.0, - "end": 4.0, - "text": "Mountain range with peaks and valleys." + "start": 3.5, + "end": 7.5, + "text": "The camera pauses at a mountain overlook while the narrator mentions weather, elevation, and route planning." + }, + { + "start": 7.5, + "end": 10.0, + "text": "A short coastal stop captures waves and beach access for a possible return trip." + }, + { + "start": 10.0, + "end": 12.0, + "text": "The clip ends with a reminder to compare these views with national park and climate notes." } ] } diff --git a/tests/uat/corpus/videos/whiteboard_session.mp4 b/tests/uat/corpus/videos/whiteboard_session.mp4 index 30da947..7b951b5 100644 Binary files a/tests/uat/corpus/videos/whiteboard_session.mp4 and b/tests/uat/corpus/videos/whiteboard_session.mp4 differ diff --git a/tests/uat/corpus/videos/whiteboard_session.transcript.json b/tests/uat/corpus/videos/whiteboard_session.transcript.json index 19b64bf..3e51f7a 100644 --- a/tests/uat/corpus/videos/whiteboard_session.transcript.json +++ b/tests/uat/corpus/videos/whiteboard_session.transcript.json @@ -1,19 +1,44 @@ { "video": "whiteboard_session.mp4", - "description": "Whiteboard session panning over brainstorm board and handwritten notes.", - "duration_seconds": 4, - "resolution": "320x240", - "fps": 15, + "memory_type": "episodic_video_fixture", + "scenario": "Product planning meeting with whiteboard decisions and next-step notes.", + "description": "Planning-meeting recording with whiteboard brainstorms, memory rollup decisions, and handwritten action items.", + "notes": "Useful for meeting-memory queries, whiteboard 
recall, product planning, memory rollups, benchmark scoring, and release documentation.", + "related_images": [ + "images/whiteboard_brainstorm.png", + "images/whiteboard_architecture.png", + "images/handwritten_notes.png" + ], + "related_documents": [ + "documents/ai_strategy_report.docx", + "documents/project_status_q1.docx", + "documents/quarterly_review.pptx", + "documents/recallforge_spec.docx" + ], + "duration_seconds": 12, + "resolution": "480x270", + "fps": 12, + "text": "Planning-meeting recording with whiteboard brainstorms, memory rollup decisions, and handwritten action items. Product planning meeting with whiteboard decisions and next-step notes. Useful for meeting-memory queries, whiteboard recall, product planning, memory rollups, benchmark scoring, and release documentation. The team brainstorms how a user should ask for the last whiteboard from a meeting. The architecture sketch shows root memories with child frames, transcripts, OCR pages, and document sections. A decision is made to score parent memories separately from raw child assets in the benchmark. Handwritten action items assign documentation updates, benchmark reruns, and release checklist cleanup.", "segments": [ { "start": 0.0, - "end": 2.0, - "text": "Brainstorming session on a whiteboard with ideas." + "end": 3.0, + "text": "The team brainstorms how a user should ask for the last whiteboard from a meeting." }, { - "start": 2.0, - "end": 4.0, - "text": "Handwritten notes with key points and action items." + "start": 3.0, + "end": 6.0, + "text": "The architecture sketch shows root memories with child frames, transcripts, OCR pages, and document sections." + }, + { + "start": 6.0, + "end": 9.0, + "text": "A decision is made to score parent memories separately from raw child assets in the benchmark." + }, + { + "start": 9.0, + "end": 12.0, + "text": "Handwritten action items assign documentation updates, benchmark reruns, and release checklist cleanup." 
} ] } diff --git a/tests/uat/helpers/generate_video_corpus.py b/tests/uat/helpers/generate_video_corpus.py index cdc1e63..edda31e 100644 --- a/tests/uat/helpers/generate_video_corpus.py +++ b/tests/uat/helpers/generate_video_corpus.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -"""Generate persistent UAT video corpus for RecallForge. +"""Generate persistent episodic UAT video corpus for RecallForge. -Creates 5 short MP4 test videos (3-5 seconds, 320x240, 15fps) plus +Creates 5 compact MP4 test videos (9-12 seconds, 480x270, 12fps) plus matching .transcript.json sidecar files in tests/uat/corpus/videos/. Run from the repo root: @@ -22,7 +22,7 @@ IMAGES_DIR = REPO_ROOT / "tests" / "uat" / "corpus" / "images" OUTPUT_DIR = REPO_ROOT / "tests" / "uat" / "corpus" / "videos" -WIDTH, HEIGHT, FPS = 320, 240, 15 +WIDTH, HEIGHT, FPS = 480, 270, 12 # --------------------------------------------------------------------------- # Video definitions @@ -31,63 +31,118 @@ VIDEOS: list[dict] = [ { "name": "coding_demo.mp4", - "images": ["code_editor_screenshot.png"], - "duration": 4, - "effects": "text_overlay", - "overlay_text": "Python coding tutorial", + "scenario": "Screen recording from a late-afternoon RecallForge debugging session.", + "images": ["code_editor_screenshot.png", "whiteboard_architecture.png", "handwritten_notes.png"], + "duration": 12, + "scene_durations": [4, 4, 4], + "effects": "episodic_sequence", + "scene_captions": [ + "3:14 PM - failing video search test", + "3:18 PM - trace vector and BM25 paths", + "3:24 PM - action items for rerank gate", + ], "transcript": [ - {"start": 0.0, "end": 2.0, "text": "Code editor showing a Python script."}, - {"start": 2.0, "end": 4.0, "text": "Walkthrough of coding concepts and syntax."}, + {"start": 0.0, "end": 3.0, "text": "The developer opens the RecallForge search pipeline test and points out a failing video-to-text case."}, + {"start": 3.0, "end": 6.5, "text": "They compare vector retrieval, BM25 fusion, and reranker 
candidates for the same query."}, + {"start": 6.5, "end": 9.5, "text": "The architecture whiteboard shows parent video memories rolling up transcript and frame children."}, + {"start": 9.5, "end": 12.0, "text": "Handwritten notes capture follow-up actions: cap media reranking, keep transcripts searchable, and rerun the benchmark."}, ], - "description": "Coding tutorial demo with text overlay on code editor screenshot.", + "description": "Screen recording of a RecallForge debugging session with code, architecture notes, and action items.", + "related_images": ["images/code_editor_screenshot.png", "images/whiteboard_architecture.png", "images/handwritten_notes.png"], + "related_documents": ["documents/recallforge_spec.docx", "documents/operations_manual.pdf", "documents/edge_deployment_guide.pdf"], + "notes": "Useful for queries about debugging, screen recordings, RecallForge implementation details, reranking gates, and transcript-heavy developer memories.", }, { "name": "nature_timelapse.mp4", - "images": ["forest_landscape.png", "mountain_landscape.png"], - "duration": 4, - "effects": "pan_zoom_slide", - "overlay_text": None, + "scenario": "Phone clip from a weekend trail scouting trip with field notes.", + "images": ["forest_landscape.png", "mountain_landscape.png", "ocean_beach.png"], + "duration": 12, + "scene_durations": [4, 4, 4], + "effects": "episodic_sequence", + "scene_captions": [ + "8:42 AM - trailhead canopy", + "9:35 AM - ridge overlook", + "10:10 AM - coastal turnout", + ], "transcript": [ - {"start": 0.0, "end": 2.0, "text": "Dense forest landscape with green canopy."}, - {"start": 2.0, "end": 4.0, "text": "Mountain range with peaks and valleys."}, + {"start": 0.0, "end": 3.5, "text": "The walk begins under a dense forest canopy with notes about tree cover and the shaded trail."}, + {"start": 3.5, "end": 7.5, "text": "The camera pauses at a mountain overlook while the narrator mentions weather, elevation, and route planning."}, + {"start": 7.5, 
"end": 10.0, "text": "A short coastal stop captures waves and beach access for a possible return trip."}, + {"start": 10.0, "end": 12.0, "text": "The clip ends with a reminder to compare these views with national park and climate notes."}, ], - "description": "Nature timelapse panning over forest and mountain landscapes.", + "description": "Episodic outdoor trip clip spanning forest, mountain, and coastal scenes with field-note narration.", + "related_images": ["images/forest_landscape.png", "images/mountain_landscape.png", "images/ocean_beach.png"], + "related_documents": ["text/nature_forests.md", "text/nature_mountains.md", "text/nature_oceans.md", "text/travel_national_parks.md"], + "notes": "Useful for visual queries over outdoor scenes and transcript queries about route planning, parks, weather, and environmental memories.", }, { "name": "architecture_walkthrough.mp4", - "images": ["whiteboard_architecture.png", "floor_plan_blueprint.png"], - "duration": 4, - "effects": "pan_zoom_slide", - "overlay_text": None, + "scenario": "Hybrid office walkthrough connecting a floor plan to system architecture decisions.", + "images": ["floor_plan_blueprint.png", "whiteboard_architecture.png", "neural_network_diagram.png"], + "duration": 12, + "scene_durations": [4, 4, 4], + "effects": "episodic_sequence", + "scene_captions": [ + "Room 214 - floor plan review", + "War room - service diagram", + "Model lab - embedding architecture", + ], "transcript": [ - {"start": 0.0, "end": 2.0, "text": "Whiteboard diagram showing system architecture components."}, - {"start": 2.0, "end": 4.0, "text": "Detailed floor plan blueprint with room layouts."}, + {"start": 0.0, "end": 3.0, "text": "The walkthrough starts on a floor plan, marking the meeting room and hallway where the demo will be installed."}, + {"start": 3.0, "end": 6.0, "text": "A whiteboard maps the API gateway, vector store, full-text index, and worker services."}, + {"start": 6.0, "end": 9.0, "text": "The presenter 
explains how embeddings, document sections, and media frames roll up into parent memories."}, + {"start": 9.0, "end": 12.0, "text": "The final note ties the physical walkthrough to the architecture deck and project milestone review."}, ], - "description": "Architecture walkthrough sliding between diagrams and blueprints.", + "description": "Office and system-architecture walkthrough with floor plan, whiteboard, model diagram, and milestone narration.", + "related_images": ["images/floor_plan_blueprint.png", "images/whiteboard_architecture.png", "images/neural_network_diagram.png"], + "related_documents": ["documents/ai_architecture_deck.pptx", "documents/project_status_q1.docx", "documents/ai_strategy_report.docx"], + "notes": "Useful for queries about architecture walkthroughs, floor plans, system design, model diagrams, and project planning documents.", }, { "name": "cooking_tutorial.mp4", - "images": ["food_pasta_dish.png"], - "duration": 3, - "effects": "zoom_in", - "overlay_text": "Pasta recipe", + "scenario": "Kitchen memory from a weeknight family recipe session.", + "images": ["food_pasta_dish.png", "handwritten_notes.png", "food_pasta_dish.png"], + "duration": 9, + "scene_durations": [3, 3, 3], + "effects": "episodic_sequence", + "scene_captions": [ + "6:02 PM - sauce check", + "6:08 PM - recipe tweak notes", + "6:16 PM - plated pasta", + ], "transcript": [ - {"start": 0.0, "end": 1.5, "text": "Fresh pasta dish plated on a white plate."}, - {"start": 1.5, "end": 3.0, "text": "Close-up of the pasta with sauce and garnish."}, + {"start": 0.0, "end": 2.5, "text": "The cook checks the pasta sauce and mentions basil, tomato, olive oil, and a lower-salt variation."}, + {"start": 2.5, "end": 5.5, "text": "A handwritten recipe note records timing changes and a reminder to try chili flakes next time."}, + {"start": 5.5, "end": 7.5, "text": "The plated pasta is compared with earlier cooking notes about fresh dough and sauce texture."}, + {"start": 7.5, "end": 9.0, 
"text": "The clip ends with a spoken tag for family dinner, recipe recall, and grocery planning."}, ], - "description": "Cooking tutorial zooming into a pasta dish with recipe text overlay.", + "description": "Kitchen recipe memory with plated pasta, spoken substitutions, and handwritten cooking notes.", + "related_images": ["images/food_pasta_dish.png", "images/handwritten_notes.png"], + "related_documents": ["text/cooking_pasta.md", "text/cooking_spices.md", "text/cooking_asian_cuisine.md"], + "notes": "Useful for queries about pasta, family dinner, recipe substitutions, grocery planning, and handwritten cooking notes.", }, { "name": "whiteboard_session.mp4", - "images": ["whiteboard_brainstorm.png", "handwritten_notes.png"], - "duration": 4, - "effects": "pan_zoom_slide", - "overlay_text": None, + "scenario": "Product planning meeting with whiteboard decisions and next-step notes.", + "images": ["whiteboard_brainstorm.png", "whiteboard_architecture.png", "handwritten_notes.png"], + "duration": 12, + "scene_durations": [4, 4, 4], + "effects": "episodic_sequence", + "scene_captions": [ + "Sprint planning - memory UX", + "Decision - parent/child rollups", + "Owner notes - docs and release", + ], "transcript": [ - {"start": 0.0, "end": 2.0, "text": "Brainstorming session on a whiteboard with ideas."}, - {"start": 2.0, "end": 4.0, "text": "Handwritten notes with key points and action items."}, + {"start": 0.0, "end": 3.0, "text": "The team brainstorms how a user should ask for the last whiteboard from a meeting."}, + {"start": 3.0, "end": 6.0, "text": "The architecture sketch shows root memories with child frames, transcripts, OCR pages, and document sections."}, + {"start": 6.0, "end": 9.0, "text": "A decision is made to score parent memories separately from raw child assets in the benchmark."}, + {"start": 9.0, "end": 12.0, "text": "Handwritten action items assign documentation updates, benchmark reruns, and release checklist cleanup."}, ], - "description": 
"Whiteboard session panning over brainstorm board and handwritten notes.", + "description": "Planning-meeting recording with whiteboard brainstorms, memory rollup decisions, and handwritten action items.", + "related_images": ["images/whiteboard_brainstorm.png", "images/whiteboard_architecture.png", "images/handwritten_notes.png"], + "related_documents": ["documents/ai_strategy_report.docx", "documents/project_status_q1.docx", "documents/quarterly_review.pptx", "documents/recallforge_spec.docx"], + "notes": "Useful for meeting-memory queries, whiteboard recall, product planning, memory rollups, benchmark scoring, and release documentation.", }, ] @@ -100,6 +155,25 @@ def _scale_filter(w: int = WIDTH, h: int = HEIGHT) -> str: return f"scale={w}:{h}:force_original_aspect_ratio=decrease,pad={w}:{h}:(ow-iw)/2:(oh-ih)/2,format=yuv420p" +def _wrap_caption(text: str, font: ImageFont.ImageFont | ImageFont.FreeTypeFont, max_width: int) -> str: + words = text.split() + lines: list[str] = [] + current: list[str] = [] + measure = ImageDraw.Draw(Image.new("RGB", (1, 1))) + + for word in words: + candidate = " ".join(current + [word]) + bbox = measure.textbbox((0, 0), candidate, font=font) + if current and (bbox[2] - bbox[0]) > max_width: + lines.append(" ".join(current)) + current = [word] + else: + current.append(word) + if current: + lines.append(" ".join(current)) + return "\n".join(lines) + + def _burn_text_onto_image(source: Path, text: str, tmp_dir: str) -> Path: """Use Pillow to draw a caption bar at the bottom of the image. 
@@ -119,8 +193,10 @@ def _burn_text_onto_image(source: Path, text: str, tmp_dir: str) -> Path: except (OSError, IOError): font = ImageFont.load_default() + text = _wrap_caption(text, font, WIDTH - 24) + # Measure text bounding box - bbox = draw.textbbox((0, 0), text, font=font) + bbox = draw.multiline_textbbox((0, 0), text, font=font, spacing=3, align="center") text_w = bbox[2] - bbox[0] text_h = bbox[3] - bbox[1] padding = 6 @@ -136,7 +212,7 @@ def _burn_text_onto_image(source: Path, text: str, tmp_dir: str) -> Path: draw = ImageDraw.Draw(img) x = (WIDTH - text_w) // 2 y = bar_top + padding - draw.text((x, y), text, font=font, fill=(255, 255, 255)) + draw.multiline_text((x, y), text, font=font, fill=(255, 255, 255), spacing=3, align="center") img.save(tmp_path, "PNG") return tmp_path @@ -179,33 +255,44 @@ def _build_single_image_cmd( ] -def _build_two_image_cmd( +def _build_image_sequence_cmd( ffmpeg: str, - img1: Path, - img2: Path, + images: list[Path], output: Path, - duration: int, + durations: list[float], ) -> list[str]: - """Build ffmpeg command for two-image slide/pan video.""" - half = duration // 2 + """Build ffmpeg command for an image-sequence video.""" + if len(images) != len(durations): + raise ValueError("images and durations must have matching lengths") + if not images: + raise ValueError("at least one image is required") + scale = _scale_filter() + inputs: list[str] = [] + filters: list[str] = [] + concat_labels: list[str] = [] + + for index, (image, duration) in enumerate(zip(images, durations)): + inputs.extend(["-loop", "1", "-t", f"{duration:g}", "-i", str(image)]) + label = f"v{index}" + filters.append( + f"[{index}:v]{scale},trim=duration={duration:g},setpts=PTS-STARTPTS[{label}]" + ) + concat_labels.append(f"[{label}]") - filter_complex = ( - f"[0:v]{scale}[v0];" - f"[1:v]{scale}[v1];" - f"[v0]trim=duration={half},setpts=PTS-STARTPTS[a];" - f"[v1]trim=duration={duration - half},setpts=PTS-STARTPTS[b];" - f"[a][b]concat=n=2:v=1:a=0[v]" - ) + 
filters.append(f"{''.join(concat_labels)}concat=n={len(images)}:v=1:a=0[v]") + filter_complex = ";".join(filters) return [ ffmpeg, "-hide_banner", "-loglevel", "error", "-y", - "-loop", "1", "-t", str(duration), "-i", str(img1), - "-loop", "1", "-t", str(duration), "-i", str(img2), + *inputs, "-filter_complex", filter_complex, "-map", "[v]", "-r", str(FPS), "-pix_fmt", "yuv420p", + "-c:v", "libx264", + "-preset", "veryfast", + "-crf", "32", str(output), ] @@ -221,17 +308,36 @@ def generate_video(ffmpeg: str, spec: dict) -> Path: raise FileNotFoundError(f"Image not found: {img}") duration = spec["duration"] - overlay = spec["overlay_text"] effects = spec["effects"] + captions = spec.get("scene_captions", []) + durations = [float(value) for value in spec.get("scene_durations", [])] + if not durations: + segment = float(duration) / float(len(images)) + durations = [segment for _ in images] with tempfile.TemporaryDirectory() as tmp_dir: + prepared_images: list[Path] = [] + for index, image in enumerate(images): + caption = captions[index] if index < len(captions) else "" + prepared_images.append( + _burn_text_onto_image(image, caption, tmp_dir) + if caption + else image + ) + if len(images) == 1: zoom = effects == "zoom_in" cmd = _build_single_image_cmd( - ffmpeg, images[0], output, duration, overlay, zoom=zoom, tmp_dir=tmp_dir + ffmpeg, + prepared_images[0], + output, + duration, + None, + zoom=zoom, + tmp_dir=tmp_dir, ) else: - cmd = _build_two_image_cmd(ffmpeg, images[0], images[1], output, duration) + cmd = _build_image_sequence_cmd(ffmpeg, prepared_images, output, durations) print(f" Generating {spec['name']} ({duration}s) ...", end=" ", flush=True) result = subprocess.run(cmd, capture_output=True, text=True) @@ -250,10 +356,19 @@ def write_transcript(spec: dict) -> Path: payload = { "video": spec["name"], + "memory_type": "episodic_video_fixture", + "scenario": spec["scenario"], "description": spec["description"], + "notes": spec["notes"], + "related_images": 
spec.get("related_images", []), + "related_documents": spec.get("related_documents", []), "duration_seconds": spec["duration"], "resolution": f"{WIDTH}x{HEIGHT}", "fps": FPS, + "text": " ".join( + [spec["description"], spec["scenario"], spec["notes"]] + + [seg["text"] for seg in spec["transcript"]] + ), "segments": [ { "start": seg["start"], @@ -303,9 +418,9 @@ def main() -> int: # Summary total_bytes = sum(p.stat().st_size for p in OUTPUT_DIR.glob("*.mp4")) - print(f"All 5 videos generated. Total size: {total_bytes // 1024} KB") - if total_bytes > 5 * 1024 * 1024: - print("WARNING: corpus exceeds 5 MB git-friendliness target", file=sys.stderr) + print(f"All {len(VIDEOS)} videos generated. Total size: {total_bytes // 1024} KB") + if total_bytes > 12 * 1024 * 1024: + print("WARNING: corpus exceeds 12 MB git-friendliness target", file=sys.stderr) return 0 diff --git a/tests/uat/test_video_quality.sh b/tests/uat/test_video_quality.sh index cbed1d9..4b7ab17 100644 --- a/tests/uat/test_video_quality.sh +++ b/tests/uat/test_video_quality.sh @@ -12,11 +12,40 @@ trap cleanup_store EXIT ensure_test_images -SELECTED_BACKEND="$(select_live_backend || true)" -USE_LIVE_BACKEND=1 -if [[ -z "${SELECTED_BACKEND}" ]]; then - USE_LIVE_BACKEND=0 - info "No usable live backend on this host; using deterministic video-quality backend." 
+video_backend_runtime_healthy() { + local backend="$1" + PYTHONPATH="${REPO_ROOT}/src:${PYTHONPATH:-}" \ + RECALLFORGE_BACKEND="${backend}" \ + RECALLFORGE_MODE="embed" \ + RECALLFORGE_MLX_QUANTIZE="${RECALLFORGE_MLX_QUANTIZE:-4bit}" \ + python3 <<PYEOF >/dev/null 2>&1 +from recallforge import get_backend + +backend = get_backend() +backend.embed_text("video quality probe") +backend.embed_image("${CORPUS_DIR}/images/whiteboard_architecture.png") +PYEOF +} + +SELECTED_BACKEND="" +USE_LIVE_BACKEND=0 + +if [[ "${UAT_VIDEO_LIVE:-0}" == "1" ]]; then + while IFS= read -r candidate; do + if [[ -n "${candidate}" ]] && video_backend_runtime_healthy "${candidate}"; then + SELECTED_BACKEND="${candidate}" + break + fi + done < <(live_backend_candidates || true) + + if [[ -n "${SELECTED_BACKEND}" ]]; then + USE_LIVE_BACKEND=1 + info "Using live ${SELECTED_BACKEND} backend for video-quality retrieval." + else + info "No usable live video backend on this host; using deterministic video-quality backend." + fi +else + info "Using deterministic video-quality backend. Set UAT_VIDEO_LIVE=1 to exercise live model retrieval." 
fi export RECALLFORGE_BACKEND="${SELECTED_BACKEND:-torch}" @@ -119,7 +148,7 @@ class ConceptBackend: def embed_video(self, path): return self._vec(os.path.basename(path)) - def rerank(self, query, documents): + def rerank(self, query, documents, **_kwargs): query_tokens = set(re.findall(r"[a-z0-9]+", self._normalize_seed(query))) scores = [] for doc in documents: @@ -133,10 +162,10 @@ class ConceptBackend: return {"lex": normalized, "vec": normalized, "hyde": normalized} def needs_reranker(self): - return self._mode in {"hybrid", "full"} + return self._mode == "hybrid" def needs_expander(self): - return self._mode == "full" + return False def get_info(self): return BackendInfo( @@ -151,7 +180,7 @@ class ConceptBackend: ) -print("\n\033[0;36m--- Index Synthetic Video ---\033[0m\n") +print("\n\033[0;36m--- Index Episodic Video ---\033[0m\n") os.environ["RECALLFORGE_MODE"] = "embed" backend = get_backend() if USE_LIVE_BACKEND else ConceptBackend("embed") storage = get_storage(STORE) @@ -174,9 +203,18 @@ print("\n\033[0;36m--- Retrieval Matrix ---\033[0m\n") expected_transcript = "whiteboard_session.mp4::transcript:" expected_frame = "whiteboard_session.mp4::frame:" expected_video = VIDEO_PATH -query_text = "whiteboard architecture diagram from a meeting" +query_text = "whiteboard meeting root memories child frames transcripts action items" + + +def has_whiteboard_memory(paths): + return any( + expected_video in path + or expected_transcript in path + or expected_frame in path + for path in paths + ) -for mode in ("embed", "hybrid", "full"): +for mode in ("embed", "hybrid"): os.environ["RECALLFORGE_MODE"] = mode backend_mode = get_backend() if USE_LIVE_BACKEND else ConceptBackend(mode) backend_mode.set_mode(mode) @@ -191,8 +229,8 @@ for mode in ("embed", "hybrid", "full"): ) transcript_results = text_searcher.search(query_text) transcript_paths = [r.filepath for r in transcript_results[:5]] - transcript_hit = any(expected_transcript in path for path in 
transcript_paths) - report(transcript_hit, f"text→video transcript retrieval ({mode})") + transcript_hit = has_whiteboard_memory(transcript_paths) + report(transcript_hit, f"text→video memory retrieval ({mode})") if REAL_VIDEO_AVAILABLE: video_searcher = HybridSearcher( @@ -216,8 +254,8 @@ for mode in ("embed", "hybrid", "full"): ) video_text_results = video_to_text_searcher.search_video(VIDEO_PATH) video_text_paths = [r.filepath for r in video_text_results[:5]] - video_text_hit = any(expected_transcript in path for path in video_text_paths) - report(video_text_hit, f"video→text retrieval ({mode})") + video_text_hit = has_whiteboard_memory(video_text_paths) + report(video_text_hit, f"video→text memory retrieval ({mode})") else: skip(f"video→video retrieval ({mode}; no real video fixture available)") skip(f"video→text retrieval ({mode}; no real video fixture available)") @@ -232,13 +270,13 @@ for mode in ("embed", "hybrid", "full"): ) frame_text_results = image_searcher.search(query_text) frame_text_paths = [r.filepath for r in frame_text_results[:5]] - frame_text_hit = any(expected_frame in path for path in frame_text_paths) - report(frame_text_hit, f"text→video frame retrieval ({mode})") + frame_text_hit = has_whiteboard_memory(frame_text_paths) + report(frame_text_hit, f"text→video frame/memory retrieval ({mode})") frame_image_results = image_searcher.search_image(WHITEBOARD_IMAGE) frame_image_paths = [r.filepath for r in frame_image_results[:5]] - frame_image_hit = any(expected_frame in path for path in frame_image_paths) - report(frame_image_hit, f"image→video frame retrieval ({mode})") + frame_image_hit = has_whiteboard_memory(frame_image_paths) + report(frame_image_hit, f"image→video frame/memory retrieval ({mode})") video_to_image_searcher = HybridSearcher( backend=backend_mode, @@ -249,8 +287,8 @@ for mode in ("embed", "hybrid", "full"): ) video_image_results = video_to_image_searcher.search_video(VIDEO_PATH) video_image_paths = [r.filepath for r in 
video_image_results[:5]] - video_image_hit = any(expected_frame in path for path in video_image_paths) - report(video_image_hit, f"video→image retrieval ({mode})") + video_image_hit = has_whiteboard_memory(video_image_paths) + report(video_image_hit, f"video→image memory retrieval ({mode})") else: skip(f"video→image retrieval ({mode}; ffmpeg unavailable)") skip(f"frame retrieval checks ({mode}; ffmpeg unavailable)")