From e6b8923378e2e21a6141fd7706187bdb5d5b6f6d Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Fri, 10 Apr 2026 12:05:26 +0200 Subject: [PATCH 1/5] adding video modality --- README.md | 20 +++- oellm/constants.py | 7 ++ oellm/resources/task-groups.yaml | 65 ++++++++++ pyproject.toml | 3 + tests/test_video_task_groups.py | 196 +++++++++++++++++++++++++++++++ 5 files changed, 290 insertions(+), 1 deletion(-) create mode 100644 tests/test_video_task_groups.py diff --git a/README.md b/README.md index 799a0af..6214e30 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ A multimodal evaluation framework for scheduling LLM and VLM evaluations across - **Task groups** for pre-defined evaluation suites with automatic dataset pre-downloading - **Multi-cluster support** with auto-detection (Leonardo, LUMI, JURECA, Snellius) - **Image evaluation** via lmms-eval (VQAv2, MMBench, MMMU, ChartQA, DocVQA, TextVQA, OCRBench, MathVista) +- **Video evaluation** via lmms-eval (MVBench, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench) - **Plugin system** for contributing custom benchmarks without touching core code - **Automatic building and deployment of containers** @@ -75,7 +76,18 @@ Super groups: `oellm-multilingual` (all multilingual benchmarks combined) | `image-ocrbench` | OCRBench | lmms-eval | | `image-mathvista` | MathVista | lmms-eval | -The lmms-eval adapter class (`llava_hf`, `qwen2_5_vl`, etc.) is auto-detected from the model name. +### Video + +| Group | Benchmark | Engine | +|---|---|---| +| `video-understanding` | All 5 benchmarks combined | lmms-eval | +| `video-mvbench` | MVBench (20 temporal tasks) | lmms-eval | +| `video-egoschema` | EgoSchema (long-form egocentric QA) | lmms-eval | +| `video-videomme` | Video-MME (11s-1h clips) | lmms-eval | +| `video-activitynet-qa` | ActivityNet-QA (requires GPT API) | lmms-eval | +| `video-longvideobench` | LongVideoBench (cross-segment reasoning) | lmms-eval | + +The lmms-eval adapter class (`llava_hf`, `llava_onevision`, `qwen2_5_vl`, etc.) is auto-detected from the model name. Install with `pip install oellm[video]` (or use a venv with lmms-eval). ### Custom Benchmarks (contrib) @@ -88,6 +100,12 @@ oellm schedule-eval \ --task-groups "image-vqa" \ --venv-path ~/elliot-venv +# Run all 5 video benchmarks +oellm schedule-eval \ + --models "lmms-lab/llava-onevision-7b" \ + --task-groups "video-understanding" \ + --venv-path ~/elliot-venv + # Mix image and text benchmarks in one submission oellm schedule-eval \ --models "llava-hf/llava-1.5-7b-hf" \ diff --git a/oellm/constants.py b/oellm/constants.py index 15915a5..c8dbf1e 100644 --- a/oellm/constants.py +++ b/oellm/constants.py @@ -18,10 +18,17 @@ class EvaluationJob: LMMS_MODEL_ADAPTERS: list[tuple[list[str], str]] = [ (["qwen2.5-vl", "qwen2_5_vl", "qwen2.5vl"], "qwen2_5_vl"), (["qwen2-vl", "qwen2_vl"], "qwen2_vl"), + # Video-capable adapters — must precede the generic "llava" pattern + (["llava-onevision", "llava_onevision"], "llava_onevision"), + (["llava-vid", "llava_vid", "llava-video"], "llava_vid"), + (["video-llava", "video_llava"], "video_llava"), (["llava"], "llava_hf"), + (["internvideo"], "internvideo2"), (["internvl"], "internvl2"), (["idefics"], "idefics3"), (["minicpm"], "minicpm_v"), + (["longva"], "longva"), + (["videochat2"], "videochat2"), (["qwen"], "qwen_vl"), ] diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index 6f1d757..47e8865 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -35,6 +35,13 @@ task_metrics: mathvista_testmini_cot: llm_as_judge_eval mathvista_testmini_format: llm_as_judge_eval mathvista_testmini_solution: llm_as_judge_eval + # lmms-eval video benchmark metrics + mvbench: mvbench_accuracy + egoschema: submission + videomme: videomme_perception_score + # ActivityNet-QA requires GPT API access for evaluation (LLM-as-judge) + activitynetqa: gpt_eval_accuracy + longvideobench_val_v: lvb_acc task_groups: open-sci-0.01: @@ -416,6 +423,64 @@ task_groups: - task: mathvista_testmini dataset: AI4Math/MathVista + # ── Video Modality (lmms-eval) ──────────────────────────────────────────── + video-understanding: + description: "Video understanding benchmarks via lmms-eval (MVBench, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)" + suite: lmms_eval + n_shots: [0] + tasks: + - task: mvbench + dataset: OpenGVLab/MVBench + - task: egoschema + dataset: lmms-lab/egoschema + - task: videomme + dataset: lmms-lab/Video-MME + - task: activitynetqa + dataset: lmms-lab/ActivityNetQA + - task: longvideobench_val_v + dataset: longvideobench/LongVideoBench + + # ── Individual Video Benchmarks (single-task groups for targeted runs) ──── + video-mvbench: + description: "MVBench short-clip temporal understanding (20 tasks) via lmms-eval" + suite: lmms_eval + n_shots: [0] + tasks: + - task: mvbench + dataset: OpenGVLab/MVBench + + video-egoschema: + description: "EgoSchema long-form egocentric video QA via lmms-eval" + suite: lmms_eval + n_shots: [0] + tasks: + - task: egoschema + dataset: lmms-lab/egoschema + + video-videomme: + description: "Video-MME full-spectrum video understanding (11s-1h) via lmms-eval" + suite: lmms_eval + n_shots: [0] + tasks: + - task: videomme + dataset: lmms-lab/Video-MME + + video-activitynet-qa: + description: "ActivityNet-QA open-ended activity video QA via lmms-eval (requires GPT API for scoring)" + suite: lmms_eval + n_shots: [0] + tasks: + - task: activitynetqa + dataset: lmms-lab/ActivityNetQA + + video-longvideobench: + description: "LongVideoBench long-video cross-segment reasoning via lmms-eval" + suite: lmms_eval + n_shots: [0] + tasks: + - task: longvideobench_val_v + dataset: longvideobench/LongVideoBench + dclm-core-22: description: "DCLM core 22 evaluation tasks (lm-eval-harness, matching LLM Foundry task types)" suite: lm-eval-harness diff --git a/pyproject.toml b/pyproject.toml index 242f8ea..37a2f8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,9 @@ dev = [ image = [ "lmms-eval @ git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git", ] +video = [ + "lmms-eval @ git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git", +] [project.scripts] oellm = "oellm.main:main" diff --git a/tests/test_video_task_groups.py b/tests/test_video_task_groups.py new file mode 100644 index 0000000..35cfe70 --- /dev/null +++ b/tests/test_video_task_groups.py @@ -0,0 +1,196 @@ +import os +import sys +from importlib.resources import files +from pathlib import Path +from unittest.mock import patch + +import yaml + +from oellm.task_groups import ( + _collect_dataset_specs, + _expand_task_groups, + get_all_task_group_names, +) + +VIDEO_TASK_GROUP = "video-understanding" + +EXPECTED_TASKS = { + "mvbench", + "egoschema", + "videomme", + "activitynetqa", + "longvideobench_val_v", +} + +EXPECTED_DATASETS = { + "OpenGVLab/MVBench", + "lmms-lab/egoschema", + "lmms-lab/Video-MME", + "lmms-lab/ActivityNetQA", + "longvideobench/LongVideoBench", +} + + +class TestVideoTaskGroupInRegistry: + def test_video_understanding_present_in_yaml(self): + all_groups = get_all_task_group_names() + assert VIDEO_TASK_GROUP in all_groups + + def test_video_understanding_suite_is_lmms_eval(self): + data = yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text()) + suite = data["task_groups"][VIDEO_TASK_GROUP]["suite"] + assert suite == "lmms_eval" + + def test_video_understanding_has_five_tasks(self): + data = yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text()) + tasks = data["task_groups"][VIDEO_TASK_GROUP]["tasks"] + assert len(tasks) == 5 + + def test_individual_video_groups_present(self): + all_groups = get_all_task_group_names() + for name in [ + "video-mvbench", + "video-egoschema", + "video-videomme", + "video-activitynet-qa", + "video-longvideobench", + ]: + assert name in all_groups, f"{name} not in task group registry" + + +class TestVideoTaskGroupExpansion: + def test_expands_to_correct_task_names(self): + results = _expand_task_groups([VIDEO_TASK_GROUP]) + task_names = {r.task for r in results} + assert task_names == EXPECTED_TASKS + + def test_all_tasks_have_zero_shot(self): + results = _expand_task_groups([VIDEO_TASK_GROUP]) + for r in results: + assert r.n_shot == 0, f"{r.task} has n_shot={r.n_shot}, expected 0" + + def test_all_tasks_route_to_lmms_eval(self): + results = _expand_task_groups([VIDEO_TASK_GROUP]) + for r in results: + assert r.suite == "lmms_eval", ( + f"{r.task} has suite='{r.suite}', expected 'lmms_eval'" + ) + + def test_expand_individual_video_group(self): + results = _expand_task_groups(["video-mvbench"]) + assert len(results) == 1 + assert results[0].task == "mvbench" + assert results[0].suite == "lmms_eval" + + +class TestVideoTaskGroupDatasetSpecs: + def test_all_expected_datasets_present(self): + specs = _collect_dataset_specs([VIDEO_TASK_GROUP]) + repo_ids = {s.repo_id for s in specs} + assert repo_ids == EXPECTED_DATASETS + + def test_no_duplicate_dataset_specs(self): + specs = _collect_dataset_specs([VIDEO_TASK_GROUP]) + keys = [(s.repo_id, s.subset) for s in specs] + assert len(keys) == len(set(keys)), "Duplicate dataset specs found" + + def test_videomme_dataset_included(self): + specs = _collect_dataset_specs([VIDEO_TASK_GROUP]) + repo_ids = {s.repo_id for s in specs} + assert "lmms-lab/Video-MME" in repo_ids + + +class TestVideoTaskGroupScheduleEvals: + """Verify video-understanding integrates with the schedule_evals dry-run path.""" + + def test_schedule_evals_dry_run_video(self, tmp_path): + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch( + "oellm.runner.detect_lmms_model_type", + return_value="llava_onevision", + ), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="lmms-lab/llava-onevision-7b", + task_groups=VIDEO_TASK_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + sbatch_files = list(tmp_path.glob("**/submit_evals.sbatch")) + assert len(sbatch_files) == 1 + sbatch_content = sbatch_files[0].read_text() + assert "lmms_eval" in sbatch_content + + def test_schedule_evals_jobs_csv_has_lmms_eval_suite(self, tmp_path): + import pandas as pd + + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch( + "oellm.runner.detect_lmms_model_type", + return_value="llava_onevision", + ), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="lmms-lab/llava-onevision-7b", + task_groups=VIDEO_TASK_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + csv_files = list(tmp_path.glob("**/jobs.csv")) + assert len(csv_files) == 1 + df = pd.read_csv(csv_files[0]) + assert all(s.startswith("lmms_eval") for s in df["eval_suite"].unique()) + assert set(df["task_path"].unique()) == EXPECTED_TASKS + + +class TestVideoModelAdapters: + """Verify video-specific model adapter detection.""" + + def test_llava_onevision_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("lmms-lab/llava-onevision-7b") == "llava_onevision" + + def test_llava_vid_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("llava-vid-7b") == "llava_vid" + + def test_video_llava_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("video-llava-7b") == "video_llava" + + def test_longva_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("longva-7b") == "longva" + + def test_internvideo_detected(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("internvideo2-chat") == "internvideo2" + + def test_generic_llava_still_works(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("llava-hf/llava-1.5-7b-hf") == "llava_hf" + + def test_qwen25_vl_still_works(self): + from oellm.constants import detect_lmms_model_type + + assert detect_lmms_model_type("Qwen/Qwen2.5-VL-7B-Instruct") == "qwen2_5_vl" From 68dffcc4133636d2a1daf88807932c055f87aac0 Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 13 Apr 2026 15:41:39 +0200 Subject: [PATCH 2/5] bug fixes --- oellm/constants.py | 2 +- oellm/resources/task-groups.yaml | 2 + oellm/resources/template.sbatch | 10 ++++- oellm/task_groups.py | 66 +++++++++++++++++++++----------- oellm/utils.py | 39 +++++++++++-------- tests/test_video_task_groups.py | 9 +++++ 6 files changed, 87 insertions(+), 41 deletions(-) diff --git a/oellm/constants.py b/oellm/constants.py index c8dbf1e..c8c7334 100644 --- a/oellm/constants.py +++ b/oellm/constants.py @@ -18,7 +18,7 @@ class EvaluationJob: LMMS_MODEL_ADAPTERS: list[tuple[list[str], str]] = [ (["qwen2.5-vl", "qwen2_5_vl", "qwen2.5vl"], "qwen2_5_vl"), (["qwen2-vl", "qwen2_vl"], "qwen2_vl"), - # Video-capable adapters — must precede the generic "llava" pattern + (["llava-hf"], "llava_hf"), (["llava-onevision", "llava_onevision"], "llava_onevision"), (["llava-vid", "llava_vid", "llava-video"], "llava_vid"), (["video-llava", "video_llava"], "video_llava"), diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index 47e8865..82a9ef9 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -431,6 +431,7 @@ task_groups: tasks: - task: mvbench dataset: OpenGVLab/MVBench + revision: video - task: egoschema dataset: lmms-lab/egoschema - task: videomme @@ -448,6 +449,7 @@ task_groups: tasks: - task: mvbench dataset: OpenGVLab/MVBench + revision: video video-egoschema: description: "EgoSchema long-form egocentric video QA via lmms-eval" diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index 17c374b..d67ab94 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -190,11 +190,17 @@ do _lmms_adapter="${{eval_suite#*:}}" OUTPUT_JSON="{evals_dir}/$(openssl rand -hex 5).json" + # LLaVA adapters need model_name to avoid a missing-import bug in lmms-eval + _lmms_extra_args="" + if [[ "$_lmms_adapter" == "llava_onevision" || "$_lmms_adapter" == "llava_vid" || "$_lmms_adapter" == "video_llava" ]]; then + _lmms_extra_args=",model_name=$(basename "$model_path")" + fi + if [ -n "$VENV_PATH" ]; then source "$VENV_PATH/bin/activate" python -m lmms_eval \ --model "$_lmms_adapter" \ - --model_args "pretrained=$model_path,device_map=auto" \ + --model_args "pretrained=$model_path,device_map=auto$_lmms_extra_args" \ --tasks "$task_path" \ --num_fewshot "$n_shot" \ --output_path "$OUTPUT_JSON" \ @@ -205,7 +211,7 @@ do $EVAL_SIF_PATH \ python -m lmms_eval \ --model "$_lmms_adapter" \ - --model_args "pretrained=$model_path,device_map=auto" \ + --model_args "pretrained=$model_path,device_map=auto$_lmms_extra_args" \ --tasks "$task_path" \ --num_fewshot "$n_shot" \ --output_path "$OUTPUT_JSON" \ diff --git a/oellm/task_groups.py b/oellm/task_groups.py index 7291420..84acbe5 100644 --- a/oellm/task_groups.py +++ b/oellm/task_groups.py @@ -9,6 +9,7 @@ class DatasetSpec: repo_id: str subset: str | None = None + revision: str | None = None @dataclass @@ -20,6 +21,7 @@ class _Task: hf_models: list[str] | None = None hf_dataset_files: list[dict] | None = None suite: str | None = None + revision: str | None = None @dataclass @@ -61,6 +63,7 @@ def from_dict(cls, name: str, data: dict) -> "TaskGroup": hf_models=task_hf_models, hf_dataset_files=task_hf_dataset_files, suite=task_data.get("suite"), + revision=task_data.get("revision"), ) ) @@ -154,16 +157,16 @@ class TaskGroupResult: def _iter_all_tasks( parsed: dict[str, TaskSuperGroup | TaskGroup], -) -> Iterable[tuple[_Task, str]]: - """Yield ``(task, suite)`` pairs from a parsed group dict, flattening super groups.""" - for group in parsed.values(): +) -> Iterable[tuple[_Task, str, str]]: + """Yield ``(task, suite, group_name)`` triples from a parsed group dict, flattening super groups.""" + for group_name, group in parsed.items(): if isinstance(group, TaskGroup): for t in group.tasks: - yield t, t.suite or group.suite + yield t, t.suite or group.suite, group_name else: for g in group.task_groups: for t in g.tasks: - yield t, t.suite or g.suite + yield t, t.suite or g.suite, g.name def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]: @@ -173,7 +176,7 @@ def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]: raise ValueError(f"Unknown task group(s): {', '.join(sorted(missing))}") results: list[TaskGroupResult] = [] - for t, suite in _iter_all_tasks(parsed): + for t, suite, _gname in _iter_all_tasks(parsed): for shot in (int(s) for s in (t.n_shots or [])): results.append(TaskGroupResult(task=t.name, n_shot=shot, suite=suite)) @@ -198,22 +201,31 @@ def _collect_dataset_specs(group_names: Iterable[str]) -> list[DatasetSpec]: parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()]) specs: list[DatasetSpec] = [] - seen: set[tuple[str, str | None]] = set() + seen: set[tuple[str, str | None, str | None]] = set() - def add_spec(dataset: str | None, subset: str | None): + def add_spec( + dataset: str | None, + subset: str | None, + revision: str | None = None, + ): if dataset is None: return - key = (dataset, subset) + key = (dataset, subset, revision) if key not in seen: seen.add(key) - specs.append(DatasetSpec(repo_id=dataset, subset=subset)) + specs.append(DatasetSpec(repo_id=dataset, subset=subset, revision=revision)) + + for t, _, group_name in _iter_all_tasks(parsed): + # Video groups use snapshot_download; default revision is "main" + revision = t.revision + if revision is None and group_name.startswith("video-"): + revision = "main" - for t, _ in _iter_all_tasks(parsed): if t.dataset == "facebook/flores" and not t.subset: for lang in _extract_flores_subsets(t.name): add_spec(t.dataset, lang) else: - add_spec(t.dataset, t.subset) + add_spec(t.dataset, t.subset, revision) return specs @@ -225,7 +237,7 @@ def _collect_hf_model_repos(group_names: Iterable[str]) -> list[str]: repos: list[str] = [] seen: set[str] = set() - for t, _ in _iter_all_tasks(parsed): + for t, _, _gname in _iter_all_tasks(parsed): for repo_id in t.hf_models or []: if repo_id not in seen: seen.add(repo_id) @@ -238,24 +250,32 @@ def _collect_hf_dataset_files(group_names: Iterable[str]) -> list[dict]: """Return deduplicated HF dataset file specs declared in task ``hf_dataset_files`` fields.""" parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()]) - # Merge patterns from all tasks that share the same repo_id so that - # a single snapshot_download fetches everything needed. - merged: dict[str, list[str]] = {} + # Merge patterns from all tasks that share the same (repo_id, revision) + # so that a single snapshot_download fetches everything needed. + merged: dict[tuple[str, str | None], list[str]] = {} - for t, _ in _iter_all_tasks(parsed): + for t, _, _gname in _iter_all_tasks(parsed): for spec in t.hf_dataset_files or []: repo_id = spec.get("repo_id", "") if not repo_id: continue + revision = spec.get("revision") patterns = spec.get("patterns") or [] - if repo_id not in merged: - merged[repo_id] = list(patterns) + key = (repo_id, revision) + if key not in merged: + merged[key] = list(patterns) else: for p in patterns: - if p not in merged[repo_id]: - merged[repo_id].append(p) + if p not in merged[key]: + merged[key].append(p) - return [{"repo_id": rid, "patterns": pats} for rid, pats in merged.items()] + result = [] + for (rid, rev), pats in merged.items(): + entry: dict = {"repo_id": rid, "patterns": pats} + if rev: + entry["revision"] = rev + result.append(entry) + return result def _build_task_dataset_map() -> dict[str, list[DatasetSpec]]: @@ -268,7 +288,7 @@ def _build_task_dataset_map() -> dict[str, list[DatasetSpec]]: task_map: dict[str, list[DatasetSpec]] = {} - for t, _ in _iter_all_tasks(parsed): + for t, _, _gname in _iter_all_tasks(parsed): if t.dataset and t.name not in task_map: if t.dataset == "facebook/flores" and not t.subset: task_map[t.name] = [ diff --git a/oellm/utils.py b/oellm/utils.py index 27bde7d..1b022ed 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -314,21 +314,9 @@ def _process_model_paths(models: Iterable[str]): if "HF_HOME" in os.environ else None ) - try: - from huggingface_hub import try_to_load_from_cache - - cached = try_to_load_from_cache( - model, "config.json", cache_dir=cache_dir - ) - if isinstance(cached, str): - logging.info( - f"Model '{model}' already cached, skipping download" - ) - per_model_paths.append(model) - continue - except Exception: - pass status.update(f"Downloading '{model}' ({idx}/{len(models_list)})") + # snapshot_download is idempotent — it skips files that + # are already cached and only fetches missing ones. snapshot_download( repo_id=model, cache_dir=cache_dir, @@ -373,9 +361,10 @@ def _pre_download_hf_dataset_files(dataset_files: list[dict]) -> None: for idx, spec in enumerate(dataset_files, 1): repo_id = spec.get("repo_id", "") patterns = spec.get("patterns") + revision = spec.get("revision") status.update(f"Downloading '{repo_id}' ({idx}/{len(dataset_files)})") try: - snapshot_download( + kwargs = dict( repo_id=repo_id, repo_type="dataset", allow_patterns=patterns, @@ -383,6 +372,9 @@ def _pre_download_hf_dataset_files(dataset_files: list[dict]) -> None: if "HF_HOME" in os.environ else None, ) + if revision: + kwargs["revision"] = revision + snapshot_download(**kwargs) except Exception as e: logging.warning(f"Failed to download dataset files from '{repo_id}': {e}") @@ -391,6 +383,7 @@ def _pre_download_datasets_from_specs( specs: Iterable, trust_remote_code: bool = True ) -> None: from datasets import get_dataset_config_names, load_dataset + from huggingface_hub import snapshot_download specs_list = list(specs) if not specs_list: @@ -406,6 +399,22 @@ def _pre_download_datasets_from_specs( label = f"{spec.repo_id}" + (f"/{spec.subset}" if spec.subset else "") status.update(f"Downloading '{label}' ({idx}/{len(specs_list)})") + # Video datasets: lmms-eval uses snapshot_download at runtime, + # so pre-download must use the same mechanism + revision. + if spec.revision is not None: + try: + snapshot_download( + repo_id=spec.repo_id, + repo_type="dataset", + revision=spec.revision, + ) + except Exception as e: + logging.warning( + f"Failed to snapshot_download '{spec.repo_id}' " + f"(revision={spec.revision}): {e}" + ) + continue + try: load_dataset( spec.repo_id, diff --git a/tests/test_video_task_groups.py b/tests/test_video_task_groups.py index 35cfe70..0b41448 100644 --- a/tests/test_video_task_groups.py +++ b/tests/test_video_task_groups.py @@ -185,6 +185,15 @@ def test_internvideo_detected(self): assert detect_lmms_model_type("internvideo2-chat") == "internvideo2" + def test_llava_hf_onevision_routes_to_llava_hf(self): + """HuggingFace-format llava-onevision models must use llava_hf, not llava_onevision.""" + from oellm.constants import detect_lmms_model_type + + assert ( + detect_lmms_model_type("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") + == "llava_hf" + ) + def test_generic_llava_still_works(self): from oellm.constants import detect_lmms_model_type From 57209d06aeae47786d634b42cae1684c8cbc4db3 Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 13 Apr 2026 15:47:43 +0200 Subject: [PATCH 3/5] fix lints --- docs/VENV.md | 2 +- oellm/contrib/regiondial_bench/__init__.py | 1 - oellm/utils.py | 12 ++++++------ requirements-venv-evalchemy.txt | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/VENV.md b/docs/VENV.md index bce8ac4..553500f 100644 --- a/docs/VENV.md +++ b/docs/VENV.md @@ -95,7 +95,7 @@ We use [Ali's fork](https://github.com/Ali-Elganzory/evalchemy) which includes a 3. Run with `EVALCHEMY_DIR` pointing to the cloned repo: ```bash - export HF_ALLOW_CODE_EVAL=1 # required by MBPP + export HF_ALLOW_CODE_EVAL=1 # required by MBPP EVALCHEMY_DIR=$(pwd)/evalchemy oellm schedule-eval \ --models HuggingFaceTB/SmolLM2-135M \ --task-groups reasoning \ diff --git a/oellm/contrib/regiondial_bench/__init__.py b/oellm/contrib/regiondial_bench/__init__.py index 8b13789..e69de29 100644 --- a/oellm/contrib/regiondial_bench/__init__.py +++ b/oellm/contrib/regiondial_bench/__init__.py @@ -1 +0,0 @@ - diff --git a/oellm/utils.py b/oellm/utils.py index 1b022ed..fd840e5 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -364,14 +364,14 @@ def _pre_download_hf_dataset_files(dataset_files: list[dict]) -> None: revision = spec.get("revision") status.update(f"Downloading '{repo_id}' ({idx}/{len(dataset_files)})") try: - kwargs = dict( - repo_id=repo_id, - repo_type="dataset", - allow_patterns=patterns, - cache_dir=Path(os.getenv("HF_HOME")) / "hub" + kwargs = { + "repo_id": repo_id, + "repo_type": "dataset", + "allow_patterns": patterns, + "cache_dir": Path(os.getenv("HF_HOME")) / "hub" if "HF_HOME" in os.environ else None, - ) + } if revision: kwargs["revision"] = revision snapshot_download(**kwargs) diff --git a/requirements-venv-evalchemy.txt b/requirements-venv-evalchemy.txt index 9ec7d3d..c63c3fb 100644 --- a/requirements-venv-evalchemy.txt +++ b/requirements-venv-evalchemy.txt @@ -1,6 +1,6 @@ # Dependencies for evalchemy evaluation -# lm-eval fork used by evalchemy +# lm-eval fork used by evalchemy lm-eval @ git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix scipy==1.17.0 From 06f2637b836dfa6a605a228995dbb8236391ec60 Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Tue, 14 Apr 2026 00:10:25 +0200 Subject: [PATCH 4/5] change benchmark --- oellm/resources/task-groups.yaml | 18 ++++++++---------- oellm/utils.py | 6 +++--- tests/test_video_task_groups.py | 10 +++++----- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index 82a9ef9..ba602e2 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -36,7 +36,7 @@ task_metrics: mathvista_testmini_format: llm_as_judge_eval mathvista_testmini_solution: llm_as_judge_eval # lmms-eval video benchmark metrics - mvbench: mvbench_accuracy + video_mmmu: mmmu_acc egoschema: submission videomme: videomme_perception_score # ActivityNet-QA requires GPT API access for evaluation (LLM-as-judge) @@ -425,13 +425,12 @@ task_groups: # ── Video Modality (lmms-eval) ──────────────────────────────────────────── video-understanding: - description: "Video understanding benchmarks via lmms-eval (MVBench, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)" + description: "Video understanding benchmarks via lmms-eval (VideoMMMU, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)" suite: lmms_eval n_shots: [0] tasks: - - task: mvbench - dataset: OpenGVLab/MVBench - revision: video + - task: video_mmmu + dataset: lmms-lab/VideoMMMU - task: egoschema dataset: lmms-lab/egoschema - task: videomme @@ -442,14 +441,13 @@ task_groups: dataset: longvideobench/LongVideoBench # ── Individual Video Benchmarks (single-task groups for targeted runs) ──── - video-mvbench: - description: "MVBench short-clip temporal understanding (20 tasks) via lmms-eval" + video-videommmu: + description: "VideoMMMU multi-discipline video understanding via lmms-eval" suite: lmms_eval n_shots: [0] tasks: - - task: mvbench - dataset: OpenGVLab/MVBench - revision: video + - task: video_mmmu + dataset: lmms-lab/VideoMMMU video-egoschema: description: "EgoSchema long-form egocentric video QA via lmms-eval" diff --git a/oellm/utils.py b/oellm/utils.py index fd840e5..f215a25 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -399,8 +399,9 @@ def _pre_download_datasets_from_specs( label = f"{spec.repo_id}" + (f"/{spec.subset}" if spec.subset else "") status.update(f"Downloading '{label}' ({idx}/{len(specs_list)})") - # Video datasets: lmms-eval uses snapshot_download at runtime, - # so pre-download must use the same mechanism + revision. + # Video datasets: lmms-eval uses snapshot_download for video + # files and load_dataset for annotations/metadata. Pre-download + # both so offline compute nodes can access everything. if spec.revision is not None: try: snapshot_download( @@ -413,7 +414,6 @@ def _pre_download_datasets_from_specs( f"Failed to snapshot_download '{spec.repo_id}' " f"(revision={spec.revision}): {e}" ) - continue try: load_dataset( diff --git a/tests/test_video_task_groups.py b/tests/test_video_task_groups.py index 0b41448..f5d7274 100644 --- a/tests/test_video_task_groups.py +++ b/tests/test_video_task_groups.py @@ -15,7 +15,7 @@ VIDEO_TASK_GROUP = "video-understanding" EXPECTED_TASKS = { - "mvbench", + "video_mmmu", "egoschema", "videomme", "activitynetqa", @@ -23,7 +23,7 @@ } EXPECTED_DATASETS = { - "OpenGVLab/MVBench", + "lmms-lab/VideoMMMU", "lmms-lab/egoschema", "lmms-lab/Video-MME", "lmms-lab/ActivityNetQA", @@ -49,7 +49,7 @@ def test_video_understanding_has_five_tasks(self): def test_individual_video_groups_present(self): all_groups = get_all_task_group_names() for name in [ - "video-mvbench", + "video-videommmu", "video-egoschema", "video-videomme", "video-activitynet-qa", @@ -77,9 +77,9 @@ def test_all_tasks_route_to_lmms_eval(self): ) def test_expand_individual_video_group(self): - results = _expand_task_groups(["video-mvbench"]) + results = _expand_task_groups(["video-videommmu"]) assert len(results) == 1 - assert results[0].task == "mvbench" + assert results[0].task == "video_mmmu" assert results[0].suite == "lmms_eval" From 1633129be824cab8df14472159810133510c9dbd Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Tue, 14 Apr 2026 11:55:29 +0200 Subject: [PATCH 5/5] clean up --- oellm/task_groups.py | 17 ++++++----------- oellm/utils.py | 14 +++++--------- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/oellm/task_groups.py b/oellm/task_groups.py index 84acbe5..e925273 100644 --- a/oellm/task_groups.py +++ b/oellm/task_groups.py @@ -9,7 +9,7 @@ class DatasetSpec: repo_id: str subset: str | None = None - revision: str | None = None + video: bool = False @dataclass @@ -21,7 +21,6 @@ class _Task: hf_models: list[str] | None = None hf_dataset_files: list[dict] | None = None suite: str | None = None - revision: str | None = None @dataclass @@ -63,7 +62,6 @@ def from_dict(cls, name: str, data: dict) -> "TaskGroup": hf_models=task_hf_models, hf_dataset_files=task_hf_dataset_files, suite=task_data.get("suite"), - revision=task_data.get("revision"), ) ) @@ -206,26 +204,23 @@ def _collect_dataset_specs(group_names: Iterable[str]) -> list[DatasetSpec]: def add_spec( dataset: str | None, subset: str | None, - revision: str | None = None, + video: bool = False, ): if dataset is None: return - key = (dataset, subset, revision) + key = (dataset, subset) if key not in seen: seen.add(key) - specs.append(DatasetSpec(repo_id=dataset, subset=subset, revision=revision)) + specs.append(DatasetSpec(repo_id=dataset, subset=subset, video=video)) for t, _, group_name in _iter_all_tasks(parsed): - # Video groups use snapshot_download; default revision is "main" - revision = t.revision - if revision is None and group_name.startswith("video-"): - revision = "main" + is_video = group_name.startswith("video-") if t.dataset == "facebook/flores" and not t.subset: for lang in _extract_flores_subsets(t.name): add_spec(t.dataset, lang) else: - add_spec(t.dataset, t.subset, revision) + add_spec(t.dataset, t.subset, video=is_video) return specs diff --git a/oellm/utils.py b/oellm/utils.py index f215a25..0c381da 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -399,21 +399,17 @@ def _pre_download_datasets_from_specs( label = f"{spec.repo_id}" + (f"/{spec.subset}" if spec.subset else "") status.update(f"Downloading '{label}' ({idx}/{len(specs_list)})") - # Video datasets: lmms-eval uses snapshot_download for video - # files and load_dataset for annotations/metadata. Pre-download - # both so offline compute nodes can access everything. - if spec.revision is not None: + # Video datasets: lmms-eval calls snapshot_download at runtime + # to get raw video files, then symlinks them into $HF_HOME. + # Pre-download so offline compute nodes find everything cached. + if spec.video: try: snapshot_download( repo_id=spec.repo_id, repo_type="dataset", - revision=spec.revision, ) except Exception as e: - logging.warning( - f"Failed to snapshot_download '{spec.repo_id}' " - f"(revision={spec.revision}): {e}" - ) + logging.warning(f"Failed to snapshot_download '{spec.repo_id}': {e}") try: load_dataset(