From e6b8923378e2e21a6141fd7706187bdb5d5b6f6d Mon Sep 17 00:00:00 2001
From: Ivan Slobozhan <ivan.slobozhan@gmail.com>
Date: Fri, 10 Apr 2026 12:05:26 +0200
Subject: [PATCH 1/5] Add video modality task groups via lmms-eval

---
 README.md                        |  20 +++-
 oellm/constants.py               |   7 ++
 oellm/resources/task-groups.yaml |  65 ++++++++++
 pyproject.toml                   |   3 +
 tests/test_video_task_groups.py  | 196 +++++++++++++++++++++++++++++++
 5 files changed, 290 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_video_task_groups.py

diff --git a/README.md b/README.md
index 799a0af..6214e30 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ A multimodal evaluation framework for scheduling LLM and VLM evaluations across
 - **Task groups** for pre-defined evaluation suites with automatic dataset pre-downloading
 - **Multi-cluster support** with auto-detection (Leonardo, LUMI, JURECA, Snellius)
 - **Image evaluation** via lmms-eval (VQAv2, MMBench, MMMU, ChartQA, DocVQA, TextVQA, OCRBench, MathVista)
+- **Video evaluation** via lmms-eval (MVBench, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)
 - **Plugin system** for contributing custom benchmarks without touching core code
 - **Automatic building and deployment of containers**
 
@@ -75,7 +76,18 @@ Super groups: `oellm-multilingual` (all multilingual benchmarks combined)
 | `image-ocrbench` | OCRBench | lmms-eval |
 | `image-mathvista` | MathVista | lmms-eval |
 
-The lmms-eval adapter class (`llava_hf`, `qwen2_5_vl`, etc.) is auto-detected from the model name.
+### Video
+
+| Group | Benchmark | Engine |
+|---|---|---|
+| `video-understanding` | All 5 benchmarks combined | lmms-eval |
+| `video-mvbench` | MVBench (20 temporal tasks) | lmms-eval |
+| `video-egoschema` | EgoSchema (long-form egocentric QA) | lmms-eval |
+| `video-videomme` | Video-MME (11s-1h clips) | lmms-eval |
+| `video-activitynet-qa` | ActivityNet-QA (requires GPT API) | lmms-eval |
+| `video-longvideobench` | LongVideoBench (cross-segment reasoning) | lmms-eval |
+
+The lmms-eval adapter class (`llava_hf`, `llava_onevision`, `qwen2_5_vl`, etc.) is auto-detected from the model name. Install with `pip install oellm[video]` (or use a venv with lmms-eval).
 
 ### Custom Benchmarks (contrib)
 
@@ -88,6 +100,12 @@ oellm schedule-eval \
     --task-groups "image-vqa" \
     --venv-path ~/elliot-venv
 
+# Run all 5 video benchmarks
+oellm schedule-eval \
+    --models "lmms-lab/llava-onevision-7b" \
+    --task-groups "video-understanding" \
+    --venv-path ~/elliot-venv
+
 # Mix image and text benchmarks in one submission
 oellm schedule-eval \
     --models "llava-hf/llava-1.5-7b-hf" \
diff --git a/oellm/constants.py b/oellm/constants.py
index 15915a5..c8dbf1e 100644
--- a/oellm/constants.py
+++ b/oellm/constants.py
@@ -18,10 +18,17 @@ class EvaluationJob:
 LMMS_MODEL_ADAPTERS: list[tuple[list[str], str]] = [
     (["qwen2.5-vl", "qwen2_5_vl", "qwen2.5vl"], "qwen2_5_vl"),
     (["qwen2-vl", "qwen2_vl"], "qwen2_vl"),
+    # Video-capable adapters — must precede the generic "llava" pattern
+    (["llava-onevision", "llava_onevision"], "llava_onevision"),
+    (["llava-vid", "llava_vid", "llava-video"], "llava_vid"),
+    (["video-llava", "video_llava"], "video_llava"),
     (["llava"], "llava_hf"),
+    (["internvideo"], "internvideo2"),
     (["internvl"], "internvl2"),
     (["idefics"], "idefics3"),
     (["minicpm"], "minicpm_v"),
+    (["longva"], "longva"),
+    (["videochat2"], "videochat2"),
     (["qwen"], "qwen_vl"),
 ]
 
diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml
index 6f1d757..47e8865 100644
--- a/oellm/resources/task-groups.yaml
+++ b/oellm/resources/task-groups.yaml
@@ -35,6 +35,13 @@ task_metrics:
   mathvista_testmini_cot: llm_as_judge_eval
   mathvista_testmini_format: llm_as_judge_eval
   mathvista_testmini_solution: llm_as_judge_eval
+  # lmms-eval video benchmark metrics
+  mvbench: mvbench_accuracy
+  egoschema: submission
+  videomme: videomme_perception_score
+  # ActivityNet-QA requires GPT API access for evaluation (LLM-as-judge)
+  activitynetqa: gpt_eval_accuracy
+  longvideobench_val_v: lvb_acc
 
 task_groups:
   open-sci-0.01:
@@ -416,6 +423,64 @@ task_groups:
       - task: mathvista_testmini
         dataset: AI4Math/MathVista
 
+  # ── Video Modality (lmms-eval) ────────────────────────────────────────────
+  video-understanding:
+    description: "Video understanding benchmarks via lmms-eval (MVBench, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)"
+    suite: lmms_eval
+    n_shots: [0]
+    tasks:
+      - task: mvbench
+        dataset: OpenGVLab/MVBench
+      - task: egoschema
+        dataset: lmms-lab/egoschema
+      - task: videomme
+        dataset: lmms-lab/Video-MME
+      - task: activitynetqa
+        dataset: lmms-lab/ActivityNetQA
+      - task: longvideobench_val_v
+        dataset: longvideobench/LongVideoBench
+
+  # ── Individual Video Benchmarks (single-task groups for targeted runs) ────
+  video-mvbench:
+    description: "MVBench short-clip temporal understanding (20 tasks) via lmms-eval"
+    suite: lmms_eval
+    n_shots: [0]
+    tasks:
+      - task: mvbench
+        dataset: OpenGVLab/MVBench
+
+  video-egoschema:
+    description: "EgoSchema long-form egocentric video QA via lmms-eval"
+    suite: lmms_eval
+    n_shots: [0]
+    tasks:
+      - task: egoschema
+        dataset: lmms-lab/egoschema
+
+  video-videomme:
+    description: "Video-MME full-spectrum video understanding (11s-1h) via lmms-eval"
+    suite: lmms_eval
+    n_shots: [0]
+    tasks:
+      - task: videomme
+        dataset: lmms-lab/Video-MME
+
+  video-activitynet-qa:
+    description: "ActivityNet-QA open-ended activity video QA via lmms-eval (requires GPT API for scoring)"
+    suite: lmms_eval
+    n_shots: [0]
+    tasks:
+      - task: activitynetqa
+        dataset: lmms-lab/ActivityNetQA
+
+  video-longvideobench:
+    description: "LongVideoBench long-video cross-segment reasoning via lmms-eval"
+    suite: lmms_eval
+    n_shots: [0]
+    tasks:
+      - task: longvideobench_val_v
+        dataset: longvideobench/LongVideoBench
+
   dclm-core-22:
     description: "DCLM core 22 evaluation tasks (lm-eval-harness, matching LLM Foundry task types)"
     suite: lm-eval-harness
diff --git a/pyproject.toml b/pyproject.toml
index 242f8ea..37a2f8f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,9 @@ dev = [
 image = [
     "lmms-eval @ git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git",
 ]
+video = [
+    "lmms-eval @ git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git",
+]
 
 [project.scripts]
 oellm = "oellm.main:main"
diff --git a/tests/test_video_task_groups.py b/tests/test_video_task_groups.py
new file mode 100644
index 0000000..35cfe70
--- /dev/null
+++ b/tests/test_video_task_groups.py
@@ -0,0 +1,196 @@
+import os
+import sys
+from importlib.resources import files
+from pathlib import Path
+from unittest.mock import patch
+
+import yaml
+
+from oellm.task_groups import (
+    _collect_dataset_specs,
+    _expand_task_groups,
+    get_all_task_group_names,
+)
+
+VIDEO_TASK_GROUP = "video-understanding"
+
+EXPECTED_TASKS = {
+    "mvbench",
+    "egoschema",
+    "videomme",
+    "activitynetqa",
+    "longvideobench_val_v",
+}
+
+EXPECTED_DATASETS = {
+    "OpenGVLab/MVBench",
+    "lmms-lab/egoschema",
+    "lmms-lab/Video-MME",
+    "lmms-lab/ActivityNetQA",
+    "longvideobench/LongVideoBench",
+}
+
+
+class TestVideoTaskGroupInRegistry:
+    def test_video_understanding_present_in_yaml(self):
+        all_groups = get_all_task_group_names()
+        assert VIDEO_TASK_GROUP in all_groups
+
+    def test_video_understanding_suite_is_lmms_eval(self):
+        data = yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text())
+        suite = data["task_groups"][VIDEO_TASK_GROUP]["suite"]
+        assert suite == "lmms_eval"
+
+    def test_video_understanding_has_five_tasks(self):
+        data = yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text())
+        tasks = data["task_groups"][VIDEO_TASK_GROUP]["tasks"]
+        assert len(tasks) == 5
+
+    def test_individual_video_groups_present(self):
+        all_groups = get_all_task_group_names()
+        for name in [
+            "video-mvbench",
+            "video-egoschema",
+            "video-videomme",
+            "video-activitynet-qa",
+            "video-longvideobench",
+        ]:
+            assert name in all_groups, f"{name} not in task group registry"
+
+
+class TestVideoTaskGroupExpansion:
+    def test_expands_to_correct_task_names(self):
+        results = _expand_task_groups([VIDEO_TASK_GROUP])
+        task_names = {r.task for r in results}
+        assert task_names == EXPECTED_TASKS
+
+    def test_all_tasks_have_zero_shot(self):
+        results = _expand_task_groups([VIDEO_TASK_GROUP])
+        for r in results:
+            assert r.n_shot == 0, f"{r.task} has n_shot={r.n_shot}, expected 0"
+
+    def test_all_tasks_route_to_lmms_eval(self):
+        results = _expand_task_groups([VIDEO_TASK_GROUP])
+        for r in results:
+            assert r.suite == "lmms_eval", (
+                f"{r.task} has suite='{r.suite}', expected 'lmms_eval'"
+            )
+
+    def test_expand_individual_video_group(self):
+        results = _expand_task_groups(["video-mvbench"])
+        assert len(results) == 1
+        assert results[0].task == "mvbench"
+        assert results[0].suite == "lmms_eval"
+
+
+class TestVideoTaskGroupDatasetSpecs:
+    def test_all_expected_datasets_present(self):
+        specs = _collect_dataset_specs([VIDEO_TASK_GROUP])
+        repo_ids = {s.repo_id for s in specs}
+        assert repo_ids == EXPECTED_DATASETS
+
+    def test_no_duplicate_dataset_specs(self):
+        specs = _collect_dataset_specs([VIDEO_TASK_GROUP])
+        keys = [(s.repo_id, s.subset) for s in specs]
+        assert len(keys) == len(set(keys)), "Duplicate dataset specs found"
+
+    def test_videomme_dataset_included(self):
+        specs = _collect_dataset_specs([VIDEO_TASK_GROUP])
+        repo_ids = {s.repo_id for s in specs}
+        assert "lmms-lab/Video-MME" in repo_ids
+
+
+class TestVideoTaskGroupScheduleEvals:
+    """Verify video-understanding integrates with the schedule_evals dry-run path."""
+
+    def test_schedule_evals_dry_run_video(self, tmp_path):
+        from oellm.main import schedule_evals
+
+        with (
+            patch("oellm.scheduler._load_cluster_env"),
+            patch("oellm.scheduler._num_jobs_in_queue", return_value=0),
+            patch(
+                "oellm.runner.detect_lmms_model_type",
+                return_value="llava_onevision",
+            ),
+            patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}),
+        ):
+            schedule_evals(
+                models="lmms-lab/llava-onevision-7b",
+                task_groups=VIDEO_TASK_GROUP,
+                skip_checks=True,
+                venv_path=str(Path(sys.prefix)),
+                dry_run=True,
+            )
+
+        sbatch_files = list(tmp_path.glob("**/submit_evals.sbatch"))
+        assert len(sbatch_files) == 1
+        sbatch_content = sbatch_files[0].read_text()
+        assert "lmms_eval" in sbatch_content
+
+    def test_schedule_evals_jobs_csv_has_lmms_eval_suite(self, tmp_path):
+        import pandas as pd
+
+        from oellm.main import schedule_evals
+
+        with (
+            patch("oellm.scheduler._load_cluster_env"),
+            patch("oellm.scheduler._num_jobs_in_queue", return_value=0),
+            patch(
+                "oellm.runner.detect_lmms_model_type",
+                return_value="llava_onevision",
+            ),
+            patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}),
+        ):
+            schedule_evals(
+                models="lmms-lab/llava-onevision-7b",
+                task_groups=VIDEO_TASK_GROUP,
+                skip_checks=True,
+                venv_path=str(Path(sys.prefix)),
+                dry_run=True,
+            )
+
+        csv_files = list(tmp_path.glob("**/jobs.csv"))
+        assert len(csv_files) == 1
+        df = pd.read_csv(csv_files[0])
+        assert all(s.startswith("lmms_eval") for s in df["eval_suite"].unique())
+        assert set(df["task_path"].unique()) == EXPECTED_TASKS
+
+
+class TestVideoModelAdapters:
+    """Verify video-specific model adapter detection."""
+
+    def test_llava_onevision_detected(self):
+        from oellm.constants import detect_lmms_model_type
+
+        assert detect_lmms_model_type("lmms-lab/llava-onevision-7b") == "llava_onevision"
+
+    def test_llava_vid_detected(self):
+        from oellm.constants import detect_lmms_model_type
+
+        assert detect_lmms_model_type("llava-vid-7b") == "llava_vid"
+
+    def test_video_llava_detected(self):
+        from oellm.constants import detect_lmms_model_type
+
+        assert detect_lmms_model_type("video-llava-7b") == "video_llava"
+
+    def test_longva_detected(self):
+        from oellm.constants import detect_lmms_model_type
+
+        assert detect_lmms_model_type("longva-7b") == "longva"
+
+    def test_internvideo_detected(self):
+        from oellm.constants import detect_lmms_model_type
+
+        assert detect_lmms_model_type("internvideo2-chat") == "internvideo2"
+
+    def test_generic_llava_still_works(self):
+        from oellm.constants import detect_lmms_model_type
+
+        assert detect_lmms_model_type("llava-hf/llava-1.5-7b-hf") == "llava_hf"
+
+    def test_qwen25_vl_still_works(self):
+        from oellm.constants import detect_lmms_model_type
+
+        assert detect_lmms_model_type("Qwen/Qwen2.5-VL-7B-Instruct") == "qwen2_5_vl"

From 68dffcc4133636d2a1daf88807932c055f87aac0 Mon Sep 17 00:00:00 2001
From: Ivan Slobozhan <ivan.slobozhan@gmail.com>
Date: Mon, 13 Apr 2026 15:41:39 +0200
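Subject: [PATCH 2/5] Fix llava-hf adapter routing and video dataset pre-download

- Match "llava-hf" before the llava-onevision pattern so HuggingFace-format
  checkpoints resolve to the llava_hf adapter.
- Pass model_name to the llava_onevision, llava_vid and video_llava adapters
  in the sbatch template.
- Add an optional revision to task entries and dataset specs; specs with a
  revision are pre-downloaded with snapshot_download.
- Default the revision to "main" for tasks in groups whose name starts with
  "video-".
- Always call snapshot_download for models instead of skipping when
  config.json is already cached.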

---
 oellm/constants.py               |  2 +-
 oellm/resources/task-groups.yaml |  2 +
 oellm/resources/template.sbatch  | 10 ++++-
 oellm/task_groups.py             | 66 +++++++++++++++++++++-----------
 oellm/utils.py                   | 39 +++++++++++--------
 tests/test_video_task_groups.py  |  9 +++++
 6 files changed, 87 insertions(+), 41 deletions(-)

diff --git a/oellm/constants.py b/oellm/constants.py
index c8dbf1e..c8c7334 100644
--- a/oellm/constants.py
+++ b/oellm/constants.py
@@ -18,7 +18,7 @@ class EvaluationJob:
 LMMS_MODEL_ADAPTERS: list[tuple[list[str], str]] = [
     (["qwen2.5-vl", "qwen2_5_vl", "qwen2.5vl"], "qwen2_5_vl"),
     (["qwen2-vl", "qwen2_vl"], "qwen2_vl"),
-    # Video-capable adapters — must precede the generic "llava" pattern
+    (["llava-hf"], "llava_hf"),
     (["llava-onevision", "llava_onevision"], "llava_onevision"),
     (["llava-vid", "llava_vid", "llava-video"], "llava_vid"),
     (["video-llava", "video_llava"], "video_llava"),
diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml
index 47e8865..82a9ef9 100644
--- a/oellm/resources/task-groups.yaml
+++ b/oellm/resources/task-groups.yaml
@@ -431,6 +431,7 @@ task_groups:
     tasks:
       - task: mvbench
         dataset: OpenGVLab/MVBench
+        revision: video
       - task: egoschema
         dataset: lmms-lab/egoschema
       - task: videomme
@@ -448,6 +449,7 @@ task_groups:
     tasks:
       - task: mvbench
         dataset: OpenGVLab/MVBench
+        revision: video
 
   video-egoschema:
     description: "EgoSchema long-form egocentric video QA via lmms-eval"
diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch
index 17c374b..d67ab94 100644
--- a/oellm/resources/template.sbatch
+++ b/oellm/resources/template.sbatch
@@ -190,11 +190,17 @@ do
             _lmms_adapter="${{eval_suite#*:}}"
             OUTPUT_JSON="{evals_dir}/$(openssl rand -hex 5).json"
 
+            # LLaVA adapters need model_name to avoid a missing-import bug in lmms-eval
+            _lmms_extra_args=""
+            if [[ "$_lmms_adapter" == "llava_onevision" || "$_lmms_adapter" == "llava_vid" || "$_lmms_adapter" == "video_llava" ]]; then
+                _lmms_extra_args=",model_name=$(basename "$model_path")"
+            fi
+
             if [ -n "$VENV_PATH" ]; then
                 source "$VENV_PATH/bin/activate"
                 python -m lmms_eval \
                     --model "$_lmms_adapter" \
-                    --model_args "pretrained=$model_path,device_map=auto" \
+                    --model_args "pretrained=$model_path,device_map=auto$_lmms_extra_args" \
                     --tasks "$task_path" \
                     --num_fewshot "$n_shot" \
                     --output_path "$OUTPUT_JSON" \
@@ -205,7 +211,7 @@ do
                     $EVAL_SIF_PATH \
                     python -m lmms_eval \
                         --model "$_lmms_adapter" \
-                        --model_args "pretrained=$model_path,device_map=auto" \
+                        --model_args "pretrained=$model_path,device_map=auto$_lmms_extra_args" \
                         --tasks "$task_path" \
                         --num_fewshot "$n_shot" \
                         --output_path "$OUTPUT_JSON" \
diff --git a/oellm/task_groups.py b/oellm/task_groups.py
index 7291420..84acbe5 100644
--- a/oellm/task_groups.py
+++ b/oellm/task_groups.py
@@ -9,6 +9,7 @@
 class DatasetSpec:
     repo_id: str
     subset: str | None = None
+    revision: str | None = None
 
 
 @dataclass
@@ -20,6 +21,7 @@ class _Task:
     hf_models: list[str] | None = None
     hf_dataset_files: list[dict] | None = None
     suite: str | None = None
+    revision: str | None = None
 
 
 @dataclass
@@ -61,6 +63,7 @@ def from_dict(cls, name: str, data: dict) -> "TaskGroup":
                     hf_models=task_hf_models,
                     hf_dataset_files=task_hf_dataset_files,
                     suite=task_data.get("suite"),
+                    revision=task_data.get("revision"),
                 )
             )
 
@@ -154,16 +157,16 @@ class TaskGroupResult:
 
 def _iter_all_tasks(
     parsed: dict[str, TaskSuperGroup | TaskGroup],
-) -> Iterable[tuple[_Task, str]]:
-    """Yield ``(task, suite)`` pairs from a parsed group dict, flattening super groups."""
-    for group in parsed.values():
+) -> Iterable[tuple[_Task, str, str]]:
+    """Yield ``(task, suite, group_name)`` triples from a parsed group dict, flattening super groups."""
+    for group_name, group in parsed.items():
         if isinstance(group, TaskGroup):
             for t in group.tasks:
-                yield t, t.suite or group.suite
+                yield t, t.suite or group.suite, group_name
         else:
             for g in group.task_groups:
                 for t in g.tasks:
-                    yield t, t.suite or g.suite
+                    yield t, t.suite or g.suite, g.name
 
 
 def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]:
@@ -173,7 +176,7 @@ def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]:
         raise ValueError(f"Unknown task group(s): {', '.join(sorted(missing))}")
 
     results: list[TaskGroupResult] = []
-    for t, suite in _iter_all_tasks(parsed):
+    for t, suite, _gname in _iter_all_tasks(parsed):
         for shot in (int(s) for s in (t.n_shots or [])):
             results.append(TaskGroupResult(task=t.name, n_shot=shot, suite=suite))
 
@@ -198,22 +201,31 @@ def _collect_dataset_specs(group_names: Iterable[str]) -> list[DatasetSpec]:
     parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()])
 
     specs: list[DatasetSpec] = []
-    seen: set[tuple[str, str | None]] = set()
+    seen: set[tuple[str, str | None, str | None]] = set()
 
-    def add_spec(dataset: str | None, subset: str | None):
+    def add_spec(
+        dataset: str | None,
+        subset: str | None,
+        revision: str | None = None,
+    ):
         if dataset is None:
             return
-        key = (dataset, subset)
+        key = (dataset, subset, revision)
         if key not in seen:
             seen.add(key)
-            specs.append(DatasetSpec(repo_id=dataset, subset=subset))
+            specs.append(DatasetSpec(repo_id=dataset, subset=subset, revision=revision))
+
+    for t, _, group_name in _iter_all_tasks(parsed):
+        # Video groups use snapshot_download; default revision is "main"
+        revision = t.revision
+        if revision is None and group_name.startswith("video-"):
+            revision = "main"
 
-    for t, _ in _iter_all_tasks(parsed):
         if t.dataset == "facebook/flores" and not t.subset:
             for lang in _extract_flores_subsets(t.name):
                 add_spec(t.dataset, lang)
         else:
-            add_spec(t.dataset, t.subset)
+            add_spec(t.dataset, t.subset, revision)
 
     return specs
 
@@ -225,7 +237,7 @@ def _collect_hf_model_repos(group_names: Iterable[str]) -> list[str]:
     repos: list[str] = []
     seen: set[str] = set()
 
-    for t, _ in _iter_all_tasks(parsed):
+    for t, _, _gname in _iter_all_tasks(parsed):
         for repo_id in t.hf_models or []:
             if repo_id not in seen:
                 seen.add(repo_id)
@@ -238,24 +250,32 @@ def _collect_hf_dataset_files(group_names: Iterable[str]) -> list[dict]:
     """Return deduplicated HF dataset file specs declared in task ``hf_dataset_files`` fields."""
     parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()])
 
-    # Merge patterns from all tasks that share the same repo_id so that
-    # a single snapshot_download fetches everything needed.
-    merged: dict[str, list[str]] = {}
+    # Merge patterns from all tasks that share the same (repo_id, revision)
+    # so that a single snapshot_download fetches everything needed.
+    merged: dict[tuple[str, str | None], list[str]] = {}
 
-    for t, _ in _iter_all_tasks(parsed):
+    for t, _, _gname in _iter_all_tasks(parsed):
         for spec in t.hf_dataset_files or []:
             repo_id = spec.get("repo_id", "")
             if not repo_id:
                 continue
+            revision = spec.get("revision")
             patterns = spec.get("patterns") or []
-            if repo_id not in merged:
-                merged[repo_id] = list(patterns)
+            key = (repo_id, revision)
+            if key not in merged:
+                merged[key] = list(patterns)
             else:
                 for p in patterns:
-                    if p not in merged[repo_id]:
-                        merged[repo_id].append(p)
+                    if p not in merged[key]:
+                        merged[key].append(p)
 
-    return [{"repo_id": rid, "patterns": pats} for rid, pats in merged.items()]
+    result = []
+    for (rid, rev), pats in merged.items():
+        entry: dict = {"repo_id": rid, "patterns": pats}
+        if rev:
+            entry["revision"] = rev
+        result.append(entry)
+    return result
 
 
 def _build_task_dataset_map() -> dict[str, list[DatasetSpec]]:
@@ -268,7 +288,7 @@ def _build_task_dataset_map() -> dict[str, list[DatasetSpec]]:
 
     task_map: dict[str, list[DatasetSpec]] = {}
 
-    for t, _ in _iter_all_tasks(parsed):
+    for t, _, _gname in _iter_all_tasks(parsed):
         if t.dataset and t.name not in task_map:
             if t.dataset == "facebook/flores" and not t.subset:
                 task_map[t.name] = [
diff --git a/oellm/utils.py b/oellm/utils.py
index 27bde7d..1b022ed 100644
--- a/oellm/utils.py
+++ b/oellm/utils.py
@@ -314,21 +314,9 @@ def _process_model_paths(models: Iterable[str]):
                         if "HF_HOME" in os.environ
                         else None
                     )
-                    try:
-                        from huggingface_hub import try_to_load_from_cache
-
-                        cached = try_to_load_from_cache(
-                            model, "config.json", cache_dir=cache_dir
-                        )
-                        if isinstance(cached, str):
-                            logging.info(
-                                f"Model '{model}' already cached, skipping download"
-                            )
-                            per_model_paths.append(model)
-                            continue
-                    except Exception:
-                        pass
                     status.update(f"Downloading '{model}' ({idx}/{len(models_list)})")
+                    # snapshot_download is idempotent — it skips files that
+                    # are already cached and only fetches missing ones.
                     snapshot_download(
                         repo_id=model,
                         cache_dir=cache_dir,
@@ -373,9 +361,10 @@ def _pre_download_hf_dataset_files(dataset_files: list[dict]) -> None:
         for idx, spec in enumerate(dataset_files, 1):
             repo_id = spec.get("repo_id", "")
             patterns = spec.get("patterns")
+            revision = spec.get("revision")
             status.update(f"Downloading '{repo_id}' ({idx}/{len(dataset_files)})")
             try:
-                snapshot_download(
+                kwargs = dict(
                     repo_id=repo_id,
                     repo_type="dataset",
                     allow_patterns=patterns,
@@ -383,6 +372,9 @@ def _pre_download_hf_dataset_files(dataset_files: list[dict]) -> None:
                     if "HF_HOME" in os.environ
                     else None,
                 )
+                if revision:
+                    kwargs["revision"] = revision
+                snapshot_download(**kwargs)
             except Exception as e:
                 logging.warning(f"Failed to download dataset files from '{repo_id}': {e}")
 
@@ -391,6 +383,7 @@ def _pre_download_datasets_from_specs(
     specs: Iterable, trust_remote_code: bool = True
 ) -> None:
     from datasets import get_dataset_config_names, load_dataset
+    from huggingface_hub import snapshot_download
 
     specs_list = list(specs)
     if not specs_list:
@@ -406,6 +399,22 @@ def _pre_download_datasets_from_specs(
             label = f"{spec.repo_id}" + (f"/{spec.subset}" if spec.subset else "")
             status.update(f"Downloading '{label}' ({idx}/{len(specs_list)})")
 
+            # Video datasets: lmms-eval uses snapshot_download at runtime,
+            # so pre-download must use the same mechanism + revision.
+            if spec.revision is not None:
+                try:
+                    snapshot_download(
+                        repo_id=spec.repo_id,
+                        repo_type="dataset",
+                        revision=spec.revision,
+                    )
+                except Exception as e:
+                    logging.warning(
+                        f"Failed to snapshot_download '{spec.repo_id}' "
+                        f"(revision={spec.revision}): {e}"
+                    )
+                continue
+
             try:
                 load_dataset(
                     spec.repo_id,
diff --git a/tests/test_video_task_groups.py b/tests/test_video_task_groups.py
index 35cfe70..0b41448 100644
--- a/tests/test_video_task_groups.py
+++ b/tests/test_video_task_groups.py
@@ -185,6 +185,15 @@ def test_internvideo_detected(self):
 
         assert detect_lmms_model_type("internvideo2-chat") == "internvideo2"
 
+    def test_llava_hf_onevision_routes_to_llava_hf(self):
+        """HuggingFace-format llava-onevision models must use llava_hf, not llava_onevision."""
+        from oellm.constants import detect_lmms_model_type
+
+        assert (
+            detect_lmms_model_type("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+            == "llava_hf"
+        )
+
     def test_generic_llava_still_works(self):
         from oellm.constants import detect_lmms_model_type
 

From 57209d06aeae47786d634b42cae1684c8cbc4db3 Mon Sep 17 00:00:00 2001
From: Ivan Slobozhan <ivan.slobozhan@gmail.com>
Date: Mon, 13 Apr 2026 15:47:43 +0200
Subject: [PATCH 3/5] Fix lint errors (trailing whitespace, dict() call)

---
 docs/VENV.md                               |  2 +-
 oellm/contrib/regiondial_bench/__init__.py |  1 -
 oellm/utils.py                             | 12 ++++++------
 requirements-venv-evalchemy.txt            |  2 +-
 4 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/docs/VENV.md b/docs/VENV.md
index bce8ac4..553500f 100644
--- a/docs/VENV.md
+++ b/docs/VENV.md
@@ -95,7 +95,7 @@ We use [Ali's fork](https://github.com/Ali-Elganzory/evalchemy) which includes a
 
 3. Run with `EVALCHEMY_DIR` pointing to the cloned repo:
    ```bash
-   export HF_ALLOW_CODE_EVAL=1  # required by MBPP 
+   export HF_ALLOW_CODE_EVAL=1  # required by MBPP
    EVALCHEMY_DIR=$(pwd)/evalchemy oellm schedule-eval \
        --models HuggingFaceTB/SmolLM2-135M \
        --task-groups reasoning \
diff --git a/oellm/contrib/regiondial_bench/__init__.py b/oellm/contrib/regiondial_bench/__init__.py
index 8b13789..e69de29 100644
--- a/oellm/contrib/regiondial_bench/__init__.py
+++ b/oellm/contrib/regiondial_bench/__init__.py
@@ -1 +0,0 @@
-
diff --git a/oellm/utils.py b/oellm/utils.py
index 1b022ed..fd840e5 100644
--- a/oellm/utils.py
+++ b/oellm/utils.py
@@ -364,14 +364,14 @@ def _pre_download_hf_dataset_files(dataset_files: list[dict]) -> None:
             revision = spec.get("revision")
             status.update(f"Downloading '{repo_id}' ({idx}/{len(dataset_files)})")
             try:
-                kwargs = dict(
-                    repo_id=repo_id,
-                    repo_type="dataset",
-                    allow_patterns=patterns,
-                    cache_dir=Path(os.getenv("HF_HOME")) / "hub"
+                kwargs = {
+                    "repo_id": repo_id,
+                    "repo_type": "dataset",
+                    "allow_patterns": patterns,
+                    "cache_dir": Path(os.getenv("HF_HOME")) / "hub"
                     if "HF_HOME" in os.environ
                     else None,
-                )
+                }
                 if revision:
                     kwargs["revision"] = revision
                 snapshot_download(**kwargs)
diff --git a/requirements-venv-evalchemy.txt b/requirements-venv-evalchemy.txt
index 9ec7d3d..c63c3fb 100644
--- a/requirements-venv-evalchemy.txt
+++ b/requirements-venv-evalchemy.txt
@@ -1,6 +1,6 @@
 # Dependencies for evalchemy evaluation
 
-# lm-eval fork used by evalchemy 
+# lm-eval fork used by evalchemy
 lm-eval @ git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix
 
 scipy==1.17.0

From 06f2637b836dfa6a605a228995dbb8236391ec60 Mon Sep 17 00:00:00 2001
From: Ivan Slobozhan <ivan.slobozhan@gmail.com>
Date: Tue, 14 Apr 2026 00:10:25 +0200
Subject: [PATCH 4/5] Replace MVBench with VideoMMMU in video task groups

---
 oellm/resources/task-groups.yaml | 18 ++++++++----------
 oellm/utils.py                   |  6 +++---
 tests/test_video_task_groups.py  | 10 +++++-----
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml
index 82a9ef9..ba602e2 100644
--- a/oellm/resources/task-groups.yaml
+++ b/oellm/resources/task-groups.yaml
@@ -36,7 +36,7 @@ task_metrics:
   mathvista_testmini_format: llm_as_judge_eval
   mathvista_testmini_solution: llm_as_judge_eval
   # lmms-eval video benchmark metrics
-  mvbench: mvbench_accuracy
+  video_mmmu: mmmu_acc
   egoschema: submission
   videomme: videomme_perception_score
   # ActivityNet-QA requires GPT API access for evaluation (LLM-as-judge)
@@ -425,13 +425,12 @@ task_groups:
 
   # ── Video Modality (lmms-eval) ────────────────────────────────────────────
   video-understanding:
-    description: "Video understanding benchmarks via lmms-eval (MVBench, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)"
+    description: "Video understanding benchmarks via lmms-eval (VideoMMMU, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)"
     suite: lmms_eval
     n_shots: [0]
     tasks:
-      - task: mvbench
-        dataset: OpenGVLab/MVBench
-        revision: video
+      - task: video_mmmu
+        dataset: lmms-lab/VideoMMMU
       - task: egoschema
         dataset: lmms-lab/egoschema
       - task: videomme
@@ -442,14 +441,13 @@ task_groups:
         dataset: longvideobench/LongVideoBench
 
   # ── Individual Video Benchmarks (single-task groups for targeted runs) ────
-  video-mvbench:
-    description: "MVBench short-clip temporal understanding (20 tasks) via lmms-eval"
+  video-videommmu:
+    description: "VideoMMMU multi-discipline video understanding via lmms-eval"
     suite: lmms_eval
     n_shots: [0]
     tasks:
-      - task: mvbench
-        dataset: OpenGVLab/MVBench
-        revision: video
+      - task: video_mmmu
+        dataset: lmms-lab/VideoMMMU
 
   video-egoschema:
     description: "EgoSchema long-form egocentric video QA via lmms-eval"
diff --git a/oellm/utils.py b/oellm/utils.py
index fd840e5..f215a25 100644
--- a/oellm/utils.py
+++ b/oellm/utils.py
@@ -399,8 +399,9 @@ def _pre_download_datasets_from_specs(
             label = f"{spec.repo_id}" + (f"/{spec.subset}" if spec.subset else "")
             status.update(f"Downloading '{label}' ({idx}/{len(specs_list)})")
 
-            # Video datasets: lmms-eval uses snapshot_download at runtime,
-            # so pre-download must use the same mechanism + revision.
+            # Video datasets: lmms-eval uses snapshot_download for video
+            # files and load_dataset for annotations/metadata.  Pre-download
+            # both so offline compute nodes can access everything.
             if spec.revision is not None:
                 try:
                     snapshot_download(
@@ -413,7 +414,6 @@ def _pre_download_datasets_from_specs(
                         f"Failed to snapshot_download '{spec.repo_id}' "
                         f"(revision={spec.revision}): {e}"
                     )
-                continue
 
             try:
                 load_dataset(
diff --git a/tests/test_video_task_groups.py b/tests/test_video_task_groups.py
index 0b41448..f5d7274 100644
--- a/tests/test_video_task_groups.py
+++ b/tests/test_video_task_groups.py
@@ -15,7 +15,7 @@
 VIDEO_TASK_GROUP = "video-understanding"
 
 EXPECTED_TASKS = {
-    "mvbench",
+    "video_mmmu",
     "egoschema",
     "videomme",
     "activitynetqa",
@@ -23,7 +23,7 @@
 }
 
 EXPECTED_DATASETS = {
-    "OpenGVLab/MVBench",
+    "lmms-lab/VideoMMMU",
     "lmms-lab/egoschema",
     "lmms-lab/Video-MME",
     "lmms-lab/ActivityNetQA",
@@ -49,7 +49,7 @@ def test_video_understanding_has_five_tasks(self):
     def test_individual_video_groups_present(self):
         all_groups = get_all_task_group_names()
         for name in [
-            "video-mvbench",
+            "video-videommmu",
             "video-egoschema",
             "video-videomme",
             "video-activitynet-qa",
@@ -77,9 +77,9 @@ def test_all_tasks_route_to_lmms_eval(self):
             )
 
     def test_expand_individual_video_group(self):
-        results = _expand_task_groups(["video-mvbench"])
+        results = _expand_task_groups(["video-videommmu"])
         assert len(results) == 1
-        assert results[0].task == "mvbench"
+        assert results[0].task == "video_mmmu"
         assert results[0].suite == "lmms_eval"
 
 

From 1633129be824cab8df14472159810133510c9dbd Mon Sep 17 00:00:00 2001
From: Ivan Slobozhan <ivan.slobozhan@gmail.com>
Date: Tue, 14 Apr 2026 11:55:29 +0200
Subject: [PATCH 5/5] clean up

---
 oellm/task_groups.py | 17 ++++++-----------
 oellm/utils.py       | 14 +++++---------
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/oellm/task_groups.py b/oellm/task_groups.py
index 84acbe5..e925273 100644
--- a/oellm/task_groups.py
+++ b/oellm/task_groups.py
@@ -9,7 +9,7 @@
 class DatasetSpec:
     repo_id: str
     subset: str | None = None
-    revision: str | None = None
+    video: bool = False
 
 
 @dataclass
@@ -21,7 +21,6 @@ class _Task:
     hf_models: list[str] | None = None
     hf_dataset_files: list[dict] | None = None
     suite: str | None = None
-    revision: str | None = None
 
 
 @dataclass
@@ -63,7 +62,6 @@ def from_dict(cls, name: str, data: dict) -> "TaskGroup":
                     hf_models=task_hf_models,
                     hf_dataset_files=task_hf_dataset_files,
                     suite=task_data.get("suite"),
-                    revision=task_data.get("revision"),
                 )
             )
 
@@ -206,26 +204,23 @@ def _collect_dataset_specs(group_names: Iterable[str]) -> list[DatasetSpec]:
     def add_spec(
         dataset: str | None,
         subset: str | None,
-        revision: str | None = None,
+        video: bool = False,
     ):
         if dataset is None:
             return
-        key = (dataset, subset, revision)
+        key = (dataset, subset, video)  # a video request must not be deduped away
         if key not in seen:
             seen.add(key)
-            specs.append(DatasetSpec(repo_id=dataset, subset=subset, revision=revision))
+            specs.append(DatasetSpec(repo_id=dataset, subset=subset, video=video))
 
     for t, _, group_name in _iter_all_tasks(parsed):
-        # Video groups use snapshot_download; default revision is "main"
-        revision = t.revision
-        if revision is None and group_name.startswith("video-"):
-            revision = "main"
+        is_video = group_name.startswith("video-")
 
         if t.dataset == "facebook/flores" and not t.subset:
             for lang in _extract_flores_subsets(t.name):
                 add_spec(t.dataset, lang)
         else:
-            add_spec(t.dataset, t.subset, revision)
+            add_spec(t.dataset, t.subset, video=is_video)
 
     return specs
 
diff --git a/oellm/utils.py b/oellm/utils.py
index f215a25..0c381da 100644
--- a/oellm/utils.py
+++ b/oellm/utils.py
@@ -399,21 +399,17 @@ def _pre_download_datasets_from_specs(
             label = f"{spec.repo_id}" + (f"/{spec.subset}" if spec.subset else "")
             status.update(f"Downloading '{label}' ({idx}/{len(specs_list)})")
 
-            # Video datasets: lmms-eval uses snapshot_download for video
-            # files and load_dataset for annotations/metadata.  Pre-download
-            # both so offline compute nodes can access everything.
-            if spec.revision is not None:
+            # Video datasets: lmms-eval calls snapshot_download at runtime to get
+            # raw video files, then symlinks them into $HF_HOME. Also pre-download
+            # the snapshot (in addition to load_dataset below) for offline nodes.
+            if spec.video:
                 try:
                     snapshot_download(
                         repo_id=spec.repo_id,
                         repo_type="dataset",
-                        revision=spec.revision,
                     )
                 except Exception as e:
-                    logging.warning(
-                        f"Failed to snapshot_download '{spec.repo_id}' "
-                        f"(revision={spec.revision}): {e}"
-                    )
+                    logging.warning(f"Failed to snapshot_download '{spec.repo_id}': {e}")
 
             try:
                 load_dataset(