diff --git a/README.md b/README.md
index 799a0af..6214e30 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ A multimodal evaluation framework for scheduling LLM and VLM evaluations across
 - **Task groups** for pre-defined evaluation suites with automatic dataset pre-downloading
 - **Multi-cluster support** with auto-detection (Leonardo, LUMI, JURECA, Snellius)
 - **Image evaluation** via lmms-eval (VQAv2, MMBench, MMMU, ChartQA, DocVQA, TextVQA, OCRBench, MathVista)
+- **Video evaluation** via lmms-eval (VideoMMMU, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)
 - **Plugin system** for contributing custom benchmarks without touching core code
 - **Automatic building and deployment of containers**

@@ -75,7 +76,18 @@ Super groups: `oellm-multilingual` (all multilingual benchmarks combined)
 | `image-ocrbench` | OCRBench | lmms-eval |
 | `image-mathvista` | MathVista | lmms-eval |

-The lmms-eval adapter class (`llava_hf`, `qwen2_5_vl`, etc.) is auto-detected from the model name.
+### Video
+
+| Group | Benchmark | Engine |
+|---|---|---|
+| `video-understanding` | All 5 benchmarks combined | lmms-eval |
+| `video-videommmu` | VideoMMMU (multi-discipline video QA) | lmms-eval |
+| `video-egoschema` | EgoSchema (long-form egocentric QA) | lmms-eval |
+| `video-videomme` | Video-MME (11s-1h clips) | lmms-eval |
+| `video-activitynet-qa` | ActivityNet-QA (requires GPT API) | lmms-eval |
+| `video-longvideobench` | LongVideoBench (cross-segment reasoning) | lmms-eval |
+
+The lmms-eval adapter class (`llava_hf`, `llava_onevision`, `qwen2_5_vl`, etc.) is auto-detected from the model name. Install with `pip install oellm[video]` (or use a venv with lmms-eval).

 ### Custom Benchmarks (contrib)

@@ -88,6 +100,12 @@ oellm schedule-eval \
   --task-groups "image-vqa" \
   --venv-path ~/elliot-venv

+# Run all 5 video benchmarks
+oellm schedule-eval \
+  --models "lmms-lab/llava-onevision-7b" \
+  --task-groups "video-understanding" \
+  --venv-path ~/elliot-venv
+
 # Mix image and text benchmarks in one submission
 oellm schedule-eval \
   --models "llava-hf/llava-1.5-7b-hf" \
diff --git a/docs/VENV.md b/docs/VENV.md
index bce8ac4..553500f 100644
--- a/docs/VENV.md
+++ b/docs/VENV.md
@@ -95,7 +95,7 @@ We use [Ali's fork](https://github.com/Ali-Elganzory/evalchemy) which includes a
 3. 
Run with `EVALCHEMY_DIR` pointing to the cloned repo: ```bash - export HF_ALLOW_CODE_EVAL=1 # required by MBPP + export HF_ALLOW_CODE_EVAL=1 # required by MBPP EVALCHEMY_DIR=$(pwd)/evalchemy oellm schedule-eval \ --models HuggingFaceTB/SmolLM2-135M \ --task-groups reasoning \ diff --git a/oellm/constants.py b/oellm/constants.py index 15915a5..c8c7334 100644 --- a/oellm/constants.py +++ b/oellm/constants.py @@ -18,10 +18,17 @@ class EvaluationJob: LMMS_MODEL_ADAPTERS: list[tuple[list[str], str]] = [ (["qwen2.5-vl", "qwen2_5_vl", "qwen2.5vl"], "qwen2_5_vl"), (["qwen2-vl", "qwen2_vl"], "qwen2_vl"), + (["llava-hf"], "llava_hf"), + (["llava-onevision", "llava_onevision"], "llava_onevision"), + (["llava-vid", "llava_vid", "llava-video"], "llava_vid"), + (["video-llava", "video_llava"], "video_llava"), (["llava"], "llava_hf"), + (["internvideo"], "internvideo2"), (["internvl"], "internvl2"), (["idefics"], "idefics3"), (["minicpm"], "minicpm_v"), + (["longva"], "longva"), + (["videochat2"], "videochat2"), (["qwen"], "qwen_vl"), ] diff --git a/oellm/contrib/regiondial_bench/__init__.py b/oellm/contrib/regiondial_bench/__init__.py index 8b13789..e69de29 100644 --- a/oellm/contrib/regiondial_bench/__init__.py +++ b/oellm/contrib/regiondial_bench/__init__.py @@ -1 +0,0 @@ - diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index 6f1d757..ba602e2 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -35,6 +35,13 @@ task_metrics: mathvista_testmini_cot: llm_as_judge_eval mathvista_testmini_format: llm_as_judge_eval mathvista_testmini_solution: llm_as_judge_eval + # lmms-eval video benchmark metrics + video_mmmu: mmmu_acc + egoschema: submission + videomme: videomme_perception_score + # ActivityNet-QA requires GPT API access for evaluation (LLM-as-judge) + activitynetqa: gpt_eval_accuracy + longvideobench_val_v: lvb_acc task_groups: open-sci-0.01: @@ -416,6 +423,64 @@ task_groups: - task: mathvista_testmini dataset: AI4Math/MathVista + # ── Video Modality (lmms-eval) ──────────────────────────────────────────── + video-understanding: + description: "Video understanding benchmarks via lmms-eval (VideoMMMU, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)" + suite: lmms_eval + n_shots: [0] + tasks: + - task: video_mmmu + dataset: lmms-lab/VideoMMMU + - task: egoschema + dataset: lmms-lab/egoschema + - task: videomme + dataset: lmms-lab/Video-MME + - task: activitynetqa + dataset: lmms-lab/ActivityNetQA + - task: longvideobench_val_v + dataset: longvideobench/LongVideoBench + + # ── Individual Video Benchmarks (single-task groups for targeted runs) ──── + video-videommmu: + description: "VideoMMMU multi-discipline video understanding via lmms-eval" + suite: lmms_eval + n_shots: [0] + tasks: + - task: video_mmmu + dataset: lmms-lab/VideoMMMU + + video-egoschema: + description: "EgoSchema long-form egocentric video QA via lmms-eval" + suite: lmms_eval + n_shots: [0] + tasks: + - task: egoschema + dataset: lmms-lab/egoschema + + video-videomme: + description: "Video-MME full-spectrum video understanding (11s-1h) via lmms-eval" + suite: lmms_eval + n_shots: [0] + tasks: + - task: videomme + dataset: lmms-lab/Video-MME + + video-activitynet-qa: + description: "ActivityNet-QA open-ended activity video QA via lmms-eval (requires GPT API for scoring)" + suite: lmms_eval + n_shots: [0] + tasks: + - task: activitynetqa + dataset: lmms-lab/ActivityNetQA + + video-longvideobench: + description: "LongVideoBench long-video cross-segment reasoning via 
lmms-eval" + suite: lmms_eval + n_shots: [0] + tasks: + - task: longvideobench_val_v + dataset: longvideobench/LongVideoBench + dclm-core-22: description: "DCLM core 22 evaluation tasks (lm-eval-harness, matching LLM Foundry task types)" suite: lm-eval-harness diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index 17c374b..d67ab94 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -190,11 +190,17 @@ do _lmms_adapter="${{eval_suite#*:}}" OUTPUT_JSON="{evals_dir}/$(openssl rand -hex 5).json" + # LLaVA adapters need model_name to avoid a missing-import bug in lmms-eval + _lmms_extra_args="" + if [[ "$_lmms_adapter" == "llava_onevision" || "$_lmms_adapter" == "llava_vid" || "$_lmms_adapter" == "video_llava" ]]; then + _lmms_extra_args=",model_name=$(basename "$model_path")" + fi + if [ -n "$VENV_PATH" ]; then source "$VENV_PATH/bin/activate" python -m lmms_eval \ --model "$_lmms_adapter" \ - --model_args "pretrained=$model_path,device_map=auto" \ + --model_args "pretrained=$model_path,device_map=auto$_lmms_extra_args" \ --tasks "$task_path" \ --num_fewshot "$n_shot" \ --output_path "$OUTPUT_JSON" \ @@ -205,7 +211,7 @@ do $EVAL_SIF_PATH \ python -m lmms_eval \ --model "$_lmms_adapter" \ - --model_args "pretrained=$model_path,device_map=auto" \ + --model_args "pretrained=$model_path,device_map=auto$_lmms_extra_args" \ --tasks "$task_path" \ --num_fewshot "$n_shot" \ --output_path "$OUTPUT_JSON" \ diff --git a/oellm/task_groups.py b/oellm/task_groups.py index 7291420..e925273 100644 --- a/oellm/task_groups.py +++ b/oellm/task_groups.py @@ -9,6 +9,7 @@ class DatasetSpec: repo_id: str subset: str | None = None + video: bool = False @dataclass @@ -154,16 +155,16 @@ class TaskGroupResult: def _iter_all_tasks( parsed: dict[str, TaskSuperGroup | TaskGroup], -) -> Iterable[tuple[_Task, str]]: - """Yield ``(task, suite)`` pairs from a parsed group dict, flattening super groups.""" - for group in parsed.values(): +) -> Iterable[tuple[_Task, str, str]]: + """Yield ``(task, suite, group_name)`` triples from a parsed group dict, flattening super groups.""" + for group_name, group in parsed.items(): if isinstance(group, TaskGroup): for t in group.tasks: - yield t, t.suite or group.suite + yield t, t.suite or group.suite, group_name else: for g in group.task_groups: for t in g.tasks: - yield t, t.suite or g.suite + yield t, t.suite or g.suite, g.name def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]: @@ -173,7 +174,7 @@ def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]: raise ValueError(f"Unknown task group(s): {', '.join(sorted(missing))}") results: list[TaskGroupResult] = [] - for t, suite in _iter_all_tasks(parsed): + for t, suite, _gname in _iter_all_tasks(parsed): for shot in (int(s) for s in (t.n_shots or [])): results.append(TaskGroupResult(task=t.name, n_shot=shot, suite=suite)) @@ -198,22 +199,28 @@ def _collect_dataset_specs(group_names: Iterable[str]) -> list[DatasetSpec]: parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()]) specs: list[DatasetSpec] = [] - seen: set[tuple[str, str | None]] = set() + seen: set[tuple[str, str | None, str | None]] = set() - def add_spec(dataset: str | None, subset: str | None): + def add_spec( + dataset: str | None, + subset: str | None, + video: bool = False, + ): if dataset is None: return key = (dataset, subset) if key not in seen: seen.add(key) - specs.append(DatasetSpec(repo_id=dataset, subset=subset)) + 
specs.append(DatasetSpec(repo_id=dataset, subset=subset, video=video)) + + for t, _, group_name in _iter_all_tasks(parsed): + is_video = group_name.startswith("video-") - for t, _ in _iter_all_tasks(parsed): if t.dataset == "facebook/flores" and not t.subset: for lang in _extract_flores_subsets(t.name): add_spec(t.dataset, lang) else: - add_spec(t.dataset, t.subset) + add_spec(t.dataset, t.subset, video=is_video) return specs @@ -225,7 +232,7 @@ def _collect_hf_model_repos(group_names: Iterable[str]) -> list[str]: repos: list[str] = [] seen: set[str] = set() - for t, _ in _iter_all_tasks(parsed): + for t, _, _gname in _iter_all_tasks(parsed): for repo_id in t.hf_models or []: if repo_id not in seen: seen.add(repo_id) @@ -238,24 +245,32 @@ def _collect_hf_dataset_files(group_names: Iterable[str]) -> list[dict]: """Return deduplicated HF dataset file specs declared in task ``hf_dataset_files`` fields.""" parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()]) - # Merge patterns from all tasks that share the same repo_id so that - # a single snapshot_download fetches everything needed. - merged: dict[str, list[str]] = {} + # Merge patterns from all tasks that share the same (repo_id, revision) + # so that a single snapshot_download fetches everything needed. + merged: dict[tuple[str, str | None], list[str]] = {} - for t, _ in _iter_all_tasks(parsed): + for t, _, _gname in _iter_all_tasks(parsed): for spec in t.hf_dataset_files or []: repo_id = spec.get("repo_id", "") if not repo_id: continue + revision = spec.get("revision") patterns = spec.get("patterns") or [] - if repo_id not in merged: - merged[repo_id] = list(patterns) + key = (repo_id, revision) + if key not in merged: + merged[key] = list(patterns) else: for p in patterns: - if p not in merged[repo_id]: - merged[repo_id].append(p) + if p not in merged[key]: + merged[key].append(p) - return [{"repo_id": rid, "patterns": pats} for rid, pats in merged.items()] + result = [] + for (rid, rev), pats in merged.items(): + entry: dict = {"repo_id": rid, "patterns": pats} + if rev: + entry["revision"] = rev + result.append(entry) + return result def _build_task_dataset_map() -> dict[str, list[DatasetSpec]]: @@ -268,7 +283,7 @@ def _build_task_dataset_map() -> dict[str, list[DatasetSpec]]: task_map: dict[str, list[DatasetSpec]] = {} - for t, _ in _iter_all_tasks(parsed): + for t, _, _gname in _iter_all_tasks(parsed): if t.dataset and t.name not in task_map: if t.dataset == "facebook/flores" and not t.subset: task_map[t.name] = [ diff --git a/oellm/utils.py b/oellm/utils.py index 27bde7d..0c381da 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -314,21 +314,9 @@ def _process_model_paths(models: Iterable[str]): if "HF_HOME" in os.environ else None ) - try: - from huggingface_hub import try_to_load_from_cache - - cached = try_to_load_from_cache( - model, "config.json", cache_dir=cache_dir - ) - if isinstance(cached, str): - logging.info( - f"Model '{model}' already cached, skipping download" - ) - per_model_paths.append(model) - continue - except Exception: - pass status.update(f"Downloading '{model}' ({idx}/{len(models_list)})") + # snapshot_download is idempotent — it skips files that + # are already cached and only fetches missing ones. 
snapshot_download( repo_id=model, cache_dir=cache_dir, @@ -373,16 +361,20 @@ def _pre_download_hf_dataset_files(dataset_files: list[dict]) -> None: for idx, spec in enumerate(dataset_files, 1): repo_id = spec.get("repo_id", "") patterns = spec.get("patterns") + revision = spec.get("revision") status.update(f"Downloading '{repo_id}' ({idx}/{len(dataset_files)})") try: - snapshot_download( - repo_id=repo_id, - repo_type="dataset", - allow_patterns=patterns, - cache_dir=Path(os.getenv("HF_HOME")) / "hub" + kwargs = { + "repo_id": repo_id, + "repo_type": "dataset", + "allow_patterns": patterns, + "cache_dir": Path(os.getenv("HF_HOME")) / "hub" if "HF_HOME" in os.environ else None, - ) + } + if revision: + kwargs["revision"] = revision + snapshot_download(**kwargs) except Exception as e: logging.warning(f"Failed to download dataset files from '{repo_id}': {e}") @@ -391,6 +383,7 @@ def _pre_download_datasets_from_specs( specs: Iterable, trust_remote_code: bool = True ) -> None: from datasets import get_dataset_config_names, load_dataset + from huggingface_hub import snapshot_download specs_list = list(specs) if not specs_list: @@ -406,6 +399,18 @@ def _pre_download_datasets_from_specs( label = f"{spec.repo_id}" + (f"/{spec.subset}" if spec.subset else "") status.update(f"Downloading '{label}' ({idx}/{len(specs_list)})") + # Video datasets: lmms-eval calls snapshot_download at runtime + # to get raw video files, then symlinks them into $HF_HOME. + # Pre-download so offline compute nodes find everything cached. + if spec.video: + try: + snapshot_download( + repo_id=spec.repo_id, + repo_type="dataset", + ) + except Exception as e: + logging.warning(f"Failed to snapshot_download '{spec.repo_id}': {e}") + try: load_dataset( spec.repo_id, diff --git a/pyproject.toml b/pyproject.toml index 242f8ea..37a2f8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,9 @@ dev = [ image = [ "lmms-eval @ git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git", ] +video = [ + "lmms-eval @ git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git", +] [project.scripts] oellm = "oellm.main:main" diff --git a/requirements-venv-evalchemy.txt b/requirements-venv-evalchemy.txt index 9ec7d3d..c63c3fb 100644 --- a/requirements-venv-evalchemy.txt +++ b/requirements-venv-evalchemy.txt @@ -1,6 +1,6 @@ # Dependencies for evalchemy evaluation -# lm-eval fork used by evalchemy +# lm-eval fork used by evalchemy lm-eval @ git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix scipy==1.17.0 diff --git a/tests/test_video_task_groups.py b/tests/test_video_task_groups.py new file mode 100644 index 0000000..f5d7274 --- /dev/null +++ b/tests/test_video_task_groups.py @@ -0,0 +1,205 @@ +import os +import sys +from importlib.resources import files +from pathlib import Path +from unittest.mock import patch + +import yaml + +from oellm.task_groups import ( + _collect_dataset_specs, + _expand_task_groups, + get_all_task_group_names, +) + +VIDEO_TASK_GROUP = "video-understanding" + +EXPECTED_TASKS = { + "video_mmmu", + "egoschema", + "videomme", + "activitynetqa", + "longvideobench_val_v", +} + +EXPECTED_DATASETS = { + "lmms-lab/VideoMMMU", + "lmms-lab/egoschema", + "lmms-lab/Video-MME", + "lmms-lab/ActivityNetQA", + "longvideobench/LongVideoBench", +} + + +class TestVideoTaskGroupInRegistry: + def test_video_understanding_present_in_yaml(self): + all_groups = get_all_task_group_names() + assert VIDEO_TASK_GROUP in all_groups + + def test_video_understanding_suite_is_lmms_eval(self): + data = 
yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text()) + suite = data["task_groups"][VIDEO_TASK_GROUP]["suite"] + assert suite == "lmms_eval" + + def test_video_understanding_has_five_tasks(self): + data = yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text()) + tasks = data["task_groups"][VIDEO_TASK_GROUP]["tasks"] + assert len(tasks) == 5 + + def test_individual_video_groups_present(self): + all_groups = get_all_task_group_names() + for name in [ + "video-videommmu", + "video-egoschema", + "video-videomme", + "video-activitynet-qa", + "video-longvideobench", + ]: + assert name in all_groups, f"{name} not in task group registry" + + +class TestVideoTaskGroupExpansion: + def test_expands_to_correct_task_names(self): + results = _expand_task_groups([VIDEO_TASK_GROUP]) + task_names = {r.task for r in results} + assert task_names == EXPECTED_TASKS + + def test_all_tasks_have_zero_shot(self): + results = _expand_task_groups([VIDEO_TASK_GROUP]) + for r in results: + assert r.n_shot == 0, f"{r.task} has n_shot={r.n_shot}, expected 0" + + def test_all_tasks_route_to_lmms_eval(self): + results = _expand_task_groups([VIDEO_TASK_GROUP]) + for r in results: + assert r.suite == "lmms_eval", ( + f"{r.task} has suite='{r.suite}', expected 'lmms_eval'" + ) + + def test_expand_individual_video_group(self): + results = _expand_task_groups(["video-videommmu"]) + assert len(results) == 1 + assert results[0].task == "video_mmmu" + assert results[0].suite == "lmms_eval" + + +class TestVideoTaskGroupDatasetSpecs: + def test_all_expected_datasets_present(self): + specs = _collect_dataset_specs([VIDEO_TASK_GROUP]) + repo_ids = {s.repo_id for s in specs} + assert repo_ids == EXPECTED_DATASETS + + def test_no_duplicate_dataset_specs(self): + specs = _collect_dataset_specs([VIDEO_TASK_GROUP]) + keys = [(s.repo_id, s.subset) for s in specs] + assert len(keys) == len(set(keys)), "Duplicate dataset specs found" + + def test_videomme_dataset_included(self): + specs = _collect_dataset_specs([VIDEO_TASK_GROUP]) + repo_ids = {s.repo_id for s in specs} + assert "lmms-lab/Video-MME" in repo_ids + + +class TestVideoTaskGroupScheduleEvals: + """Verify video-understanding integrates with the schedule_evals dry-run path.""" + + def test_schedule_evals_dry_run_video(self, tmp_path): + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch( + "oellm.runner.detect_lmms_model_type", + return_value="llava_onevision", + ), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="lmms-lab/llava-onevision-7b", + task_groups=VIDEO_TASK_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + sbatch_files = list(tmp_path.glob("**/submit_evals.sbatch")) + assert len(sbatch_files) == 1 + sbatch_content = sbatch_files[0].read_text() + assert "lmms_eval" in sbatch_content + + def test_schedule_evals_jobs_csv_has_lmms_eval_suite(self, tmp_path): + import pandas as pd + + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch( + "oellm.runner.detect_lmms_model_type", + return_value="llava_onevision", + ), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="lmms-lab/llava-onevision-7b", + task_groups=VIDEO_TASK_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + 
dry_run=True, + ) + + csv_files = list(tmp_path.glob("**/jobs.csv")) + assert len(csv_files) == 1 + df = pd.read_csv(csv_files[0]) + assert all(s.startswith("lmms_eval") for s in df["eval_suite"].unique()) + assert set(df["task_path"].unique()) == EXPECTED_TASKS + + +class TestVideoModelAdapters: + """Verify video-specific model adapter detection.""" + + def test_llava_onevision_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("lmms-lab/llava-onevision-7b") == "llava_onevision" + + def test_llava_vid_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("llava-vid-7b") == "llava_vid" + + def test_video_llava_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("video-llava-7b") == "video_llava" + + def test_longva_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("longva-7b") == "longva" + + def test_internvideo_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("internvideo2-chat") == "internvideo2" + + def test_llava_hf_onevision_routes_to_llava_hf(self): + """HuggingFace-format llava-onevision models must use llava_hf, not llava_onevision.""" + from oellm.constants import detect_lmms_model_type + + assert ( + detect_lmms_model_type("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") + == "llava_hf" + ) + + def test_generic_llava_still_works(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("llava-hf/llava-1.5-7b-hf") == "llava_hf" + + def test_qwen25_vl_still_works(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("Qwen/Qwen2.5-VL-7B-Instruct") == "qwen2_5_vl"
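
Note on adapter ordering in `oellm/constants.py`: detection is first-substring-match over `LMMS_MODEL_ADAPTERS`, so the `llava-hf` entry must stay ahead of `llava-onevision` for HuggingFace-format OneVision checkpoints to route to `llava_hf`. A minimal sketch of the assumed matching logic (the shipped `detect_lmms_model_type` may differ in details such as the fallback adapter):

```python
# Sketch of first-match adapter detection; assumes case-insensitive
# substring matching and a llava_hf fallback, neither shown in the diff.
LMMS_MODEL_ADAPTERS: list[tuple[list[str], str]] = [
    (["qwen2.5-vl", "qwen2_5_vl", "qwen2.5vl"], "qwen2_5_vl"),
    (["qwen2-vl", "qwen2_vl"], "qwen2_vl"),
    # Must precede llava-onevision: "llava-hf/llava-onevision-*-hf"
    # checkpoints need the llava_hf adapter, not llava_onevision.
    (["llava-hf"], "llava_hf"),
    (["llava-onevision", "llava_onevision"], "llava_onevision"),
    (["llava-vid", "llava_vid", "llava-video"], "llava_vid"),
    (["video-llava", "video_llava"], "video_llava"),
    (["llava"], "llava_hf"),
    (["internvideo"], "internvideo2"),
    (["internvl"], "internvl2"),
    (["idefics"], "idefics3"),
    (["minicpm"], "minicpm_v"),
    (["longva"], "longva"),
    (["videochat2"], "videochat2"),
    (["qwen"], "qwen_vl"),
]


def detect_lmms_model_type(model_name: str) -> str:
    """Return the lmms-eval adapter class for a model name (first match wins)."""
    lowered = model_name.lower()
    for substrings, adapter in LMMS_MODEL_ADAPTERS:
        if any(s in lowered for s in substrings):
            return adapter
    return "llava_hf"  # assumed default


assert detect_lmms_model_type("lmms-lab/llava-onevision-7b") == "llava_onevision"
assert detect_lmms_model_type("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") == "llava_hf"
```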
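
Worked example of the new `(repo_id, revision)` merge key in `_collect_hf_dataset_files`: two tasks pinning different revisions of the same repo no longer collapse into one download, while tasks sharing a revision still merge their patterns. A condensed equivalent of the merge loop, with hypothetical repo names and patterns:

```python
# Hypothetical specs, in the shape the diff reads from t.hf_dataset_files.
specs = [
    {"repo_id": "org/data", "patterns": ["a/*"], "revision": "v1"},
    {"repo_id": "org/data", "patterns": ["b/*"], "revision": "v1"},
    {"repo_id": "org/data", "patterns": ["a/*"], "revision": "v2"},
]

merged: dict[tuple[str, str | None], list[str]] = {}
for spec in specs:
    key = (spec["repo_id"], spec.get("revision"))
    for p in spec.get("patterns") or []:
        if p not in merged.setdefault(key, []):
            merged[key].append(p)

# One snapshot_download per (repo, revision); patterns merged within each:
# {("org/data", "v1"): ["a/*", "b/*"], ("org/data", "v2"): ["a/*"]}
```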
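
End-to-end shape of the video pre-download path: `_collect_dataset_specs` marks specs coming from `video-*` groups with `video=True`, and the pre-download step then snapshots the whole dataset repo so that lmms-eval's runtime `snapshot_download` hits a warm cache on offline compute nodes. A standalone sketch under those assumptions:

```python
import logging

from huggingface_hub import snapshot_download

from oellm.task_groups import _collect_dataset_specs

# Warm the HF cache for every video benchmark before the job is submitted.
for spec in _collect_dataset_specs(["video-understanding"]):
    if spec.video:
        try:
            # Full-repo snapshot: no allow_patterns, because lmms-eval
            # symlinks raw video files out of this cache at runtime.
            snapshot_download(repo_id=spec.repo_id, repo_type="dataset")
        except Exception as e:
            logging.warning(f"Failed to snapshot_download '{spec.repo_id}': {e}")
```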