Merged
20 changes: 19 additions & 1 deletion README.md
@@ -9,6 +9,7 @@ A multimodal evaluation framework for scheduling LLM and VLM evaluations across
- **Task groups** for pre-defined evaluation suites with automatic dataset pre-downloading
- **Multi-cluster support** with auto-detection (Leonardo, LUMI, JURECA, Snellius)
- **Image evaluation** via lmms-eval (VQAv2, MMBench, MMMU, ChartQA, DocVQA, TextVQA, OCRBench, MathVista)
- **Video evaluation** via lmms-eval (VideoMMMU, EgoSchema, Video-MME, ActivityNet-QA, LongVideoBench)
- **Plugin system** for contributing custom benchmarks without touching core code
- **Automatic building and deployment of containers**

@@ -75,7 +76,18 @@ Super groups: `oellm-multilingual` (all multilingual benchmarks combined)
| `image-ocrbench` | OCRBench | lmms-eval |
| `image-mathvista` | MathVista | lmms-eval |

The lmms-eval adapter class (`llava_hf`, `qwen2_5_vl`, etc.) is auto-detected from the model name.
### Video

| Group | Benchmark | Engine |
|---|---|---|
| `video-understanding` | All 5 benchmarks combined | lmms-eval |
| `video-videommmu` | VideoMMMU (multi-discipline video QA) | lmms-eval |
| `video-egoschema` | EgoSchema (long-form egocentric QA) | lmms-eval |
| `video-videomme` | Video-MME (11s-1h clips) | lmms-eval |
| `video-activitynet-qa` | ActivityNet-QA (requires GPT API) | lmms-eval |
| `video-longvideobench` | LongVideoBench (cross-segment reasoning) | lmms-eval |

The lmms-eval adapter class (`llava_hf`, `llava_onevision`, `qwen2_5_vl`, etc.) is auto-detected from the model name. Install with `pip install oellm[video]` (or use a venv with lmms-eval).
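Auto-detection is an ordered substring match over the lowercased model name against `LMMS_MODEL_ADAPTERS` in `oellm/constants.py`, so specific patterns must precede generic ones. A minimal sketch of the idea with an abbreviated pattern table — the helper name and default fallback are illustrative, not the actual oellm API:

```python
# Sketch of adapter auto-detection: the first pattern list containing a
# substring of the lowercased model name wins, so specific patterns
# (e.g. "qwen2.5-vl") must be listed before generic ones ("qwen").
LMMS_MODEL_ADAPTERS: list[tuple[list[str], str]] = [
    (["qwen2.5-vl", "qwen2_5_vl", "qwen2.5vl"], "qwen2_5_vl"),
    (["llava-onevision", "llava_onevision"], "llava_onevision"),
    (["llava"], "llava_hf"),
    (["qwen"], "qwen_vl"),
]

def detect_adapter(model_name: str, default: str = "llava_hf") -> str:
    """Return the first matching lmms-eval adapter for a model name."""
    name = model_name.lower()
    for patterns, adapter in LMMS_MODEL_ADAPTERS:
        if any(p in name for p in patterns):
            return adapter
    return default
```

With this ordering, `detect_adapter("lmms-lab/llava-onevision-7b")` resolves to `llava_onevision` rather than falling through to the generic `llava` entry.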

### Custom Benchmarks (contrib)

@@ -88,6 +100,12 @@ oellm schedule-eval \
--task-groups "image-vqa" \
--venv-path ~/elliot-venv

# Run all 5 video benchmarks
oellm schedule-eval \
--models "lmms-lab/llava-onevision-7b" \
--task-groups "video-understanding" \
--venv-path ~/elliot-venv

# Mix image and text benchmarks in one submission
oellm schedule-eval \
--models "llava-hf/llava-1.5-7b-hf" \
2 changes: 1 addition & 1 deletion docs/VENV.md
@@ -95,7 +95,7 @@ We use [Ali's fork](https://github.com/Ali-Elganzory/evalchemy) which includes a

3. Run with `EVALCHEMY_DIR` pointing to the cloned repo:
```bash
export HF_ALLOW_CODE_EVAL=1 # required by MBPP
export HF_ALLOW_CODE_EVAL=1 # required by MBPP
EVALCHEMY_DIR=$(pwd)/evalchemy oellm schedule-eval \
--models HuggingFaceTB/SmolLM2-135M \
--task-groups reasoning \
7 changes: 7 additions & 0 deletions oellm/constants.py
@@ -18,10 +18,17 @@ class EvaluationJob:
LMMS_MODEL_ADAPTERS: list[tuple[list[str], str]] = [
(["qwen2.5-vl", "qwen2_5_vl", "qwen2.5vl"], "qwen2_5_vl"),
(["qwen2-vl", "qwen2_vl"], "qwen2_vl"),
(["llava-hf"], "llava_hf"),
(["llava-onevision", "llava_onevision"], "llava_onevision"),
(["llava-vid", "llava_vid", "llava-video"], "llava_vid"),
(["video-llava", "video_llava"], "video_llava"),
(["llava"], "llava_hf"),
(["internvideo"], "internvideo2"),
(["internvl"], "internvl2"),
(["idefics"], "idefics3"),
(["minicpm"], "minicpm_v"),
(["longva"], "longva"),
(["videochat2"], "videochat2"),
(["qwen"], "qwen_vl"),
]

1 change: 0 additions & 1 deletion oellm/contrib/regiondial_bench/__init__.py
@@ -1 +0,0 @@

65 changes: 65 additions & 0 deletions oellm/resources/task-groups.yaml
@@ -35,6 +35,13 @@ task_metrics:
mathvista_testmini_cot: llm_as_judge_eval
mathvista_testmini_format: llm_as_judge_eval
mathvista_testmini_solution: llm_as_judge_eval
# lmms-eval video benchmark metrics
video_mmmu: mmmu_acc
egoschema: submission
videomme: videomme_perception_score
# ActivityNet-QA requires GPT API access for evaluation (LLM-as-judge)
activitynetqa: gpt_eval_accuracy
longvideobench_val_v: lvb_acc

task_groups:
open-sci-0.01:
@@ -416,6 +423,64 @@ task_groups:
- task: mathvista_testmini
dataset: AI4Math/MathVista

# ── Video Modality (lmms-eval) ────────────────────────────────────────────
video-understanding:
description: "Video understanding benchmarks via lmms-eval (VideoMMMU, EgoSchema, VideoMME, ActivityNet-QA, LongVideoBench)"
suite: lmms_eval
n_shots: [0]
tasks:
- task: video_mmmu
dataset: lmms-lab/VideoMMMU
- task: egoschema
dataset: lmms-lab/egoschema
- task: videomme
dataset: lmms-lab/Video-MME
- task: activitynetqa
dataset: lmms-lab/ActivityNetQA
- task: longvideobench_val_v
dataset: longvideobench/LongVideoBench

# ── Individual Video Benchmarks (single-task groups for targeted runs) ────
video-videommmu:
description: "VideoMMMU multi-discipline video understanding via lmms-eval"
suite: lmms_eval
n_shots: [0]
tasks:
- task: video_mmmu
dataset: lmms-lab/VideoMMMU

video-egoschema:
description: "EgoSchema long-form egocentric video QA via lmms-eval"
suite: lmms_eval
n_shots: [0]
tasks:
- task: egoschema
dataset: lmms-lab/egoschema

video-videomme:
description: "Video-MME full-spectrum video understanding (11s-1h) via lmms-eval"
suite: lmms_eval
n_shots: [0]
tasks:
- task: videomme
dataset: lmms-lab/Video-MME

video-activitynet-qa:
description: "ActivityNet-QA open-ended activity video QA via lmms-eval (requires GPT API for scoring)"
suite: lmms_eval
n_shots: [0]
tasks:
- task: activitynetqa
dataset: lmms-lab/ActivityNetQA

video-longvideobench:
description: "LongVideoBench long-video cross-segment reasoning via lmms-eval"
suite: lmms_eval
n_shots: [0]
tasks:
- task: longvideobench_val_v
dataset: longvideobench/LongVideoBench

dclm-core-22:
description: "DCLM core 22 evaluation tasks (lm-eval-harness, matching LLM Foundry task types)"
suite: lm-eval-harness
10 changes: 8 additions & 2 deletions oellm/resources/template.sbatch
@@ -190,11 +190,17 @@ do
_lmms_adapter="${{eval_suite#*:}}"
OUTPUT_JSON="{evals_dir}/$(openssl rand -hex 5).json"

# LLaVA adapters need model_name to avoid a missing-import bug in lmms-eval
_lmms_extra_args=""
if [[ "$_lmms_adapter" == "llava_onevision" || "$_lmms_adapter" == "llava_vid" || "$_lmms_adapter" == "video_llava" ]]; then
_lmms_extra_args=",model_name=$(basename "$model_path")"
fi

if [ -n "$VENV_PATH" ]; then
source "$VENV_PATH/bin/activate"
python -m lmms_eval \
--model "$_lmms_adapter" \
--model_args "pretrained=$model_path,device_map=auto" \
--model_args "pretrained=$model_path,device_map=auto$_lmms_extra_args" \
--tasks "$task_path" \
--num_fewshot "$n_shot" \
--output_path "$OUTPUT_JSON" \
@@ -205,7 +211,7 @@
$EVAL_SIF_PATH \
python -m lmms_eval \
--model "$_lmms_adapter" \
--model_args "pretrained=$model_path,device_map=auto" \
--model_args "pretrained=$model_path,device_map=auto$_lmms_extra_args" \
--tasks "$task_path" \
--num_fewshot "$n_shot" \
--output_path "$OUTPUT_JSON" \
59 changes: 37 additions & 22 deletions oellm/task_groups.py
@@ -9,6 +9,7 @@
class DatasetSpec:
repo_id: str
subset: str | None = None
video: bool = False


@dataclass
@@ -154,16 +155,16 @@ class TaskGroupResult:

def _iter_all_tasks(
parsed: dict[str, TaskSuperGroup | TaskGroup],
) -> Iterable[tuple[_Task, str]]:
"""Yield ``(task, suite)`` pairs from a parsed group dict, flattening super groups."""
for group in parsed.values():
) -> Iterable[tuple[_Task, str, str]]:
"""Yield ``(task, suite, group_name)`` triples from a parsed group dict, flattening super groups."""
for group_name, group in parsed.items():
if isinstance(group, TaskGroup):
for t in group.tasks:
yield t, t.suite or group.suite
yield t, t.suite or group.suite, group_name
else:
for g in group.task_groups:
for t in g.tasks:
yield t, t.suite or g.suite
yield t, t.suite or g.suite, g.name


def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]:
@@ -173,7 +174,7 @@ def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]:
raise ValueError(f"Unknown task group(s): {', '.join(sorted(missing))}")

results: list[TaskGroupResult] = []
for t, suite in _iter_all_tasks(parsed):
for t, suite, _gname in _iter_all_tasks(parsed):
for shot in (int(s) for s in (t.n_shots or [])):
results.append(TaskGroupResult(task=t.name, n_shot=shot, suite=suite))

@@ -198,22 +199,28 @@ def _collect_dataset_specs(group_names: Iterable[str]) -> list[DatasetSpec]:
parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()])

specs: list[DatasetSpec] = []
seen: set[tuple[str, str | None]] = set()
seen: set[tuple[str, str | None, bool]] = set()

def add_spec(dataset: str | None, subset: str | None):
def add_spec(
dataset: str | None,
subset: str | None,
video: bool = False,
):
if dataset is None:
return
key = (dataset, subset, video)
if key not in seen:
seen.add(key)
specs.append(DatasetSpec(repo_id=dataset, subset=subset))
specs.append(DatasetSpec(repo_id=dataset, subset=subset, video=video))

for t, _, group_name in _iter_all_tasks(parsed):
is_video = group_name.startswith("video-")

for t, _ in _iter_all_tasks(parsed):
if t.dataset == "facebook/flores" and not t.subset:
for lang in _extract_flores_subsets(t.name):
add_spec(t.dataset, lang)
else:
add_spec(t.dataset, t.subset)
add_spec(t.dataset, t.subset, video=is_video)

return specs

@@ -225,7 +232,7 @@ def _collect_hf_model_repos(group_names: Iterable[str]) -> list[str]:
repos: list[str] = []
seen: set[str] = set()

for t, _ in _iter_all_tasks(parsed):
for t, _, _gname in _iter_all_tasks(parsed):
for repo_id in t.hf_models or []:
if repo_id not in seen:
seen.add(repo_id)
@@ -238,24 +245,32 @@ def _collect_hf_dataset_files(group_names: Iterable[str]) -> list[dict]:
"""Return deduplicated HF dataset file specs declared in task ``hf_dataset_files`` fields."""
parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()])

# Merge patterns from all tasks that share the same repo_id so that
# a single snapshot_download fetches everything needed.
merged: dict[str, list[str]] = {}
# Merge patterns from all tasks that share the same (repo_id, revision)
# so that a single snapshot_download fetches everything needed.
merged: dict[tuple[str, str | None], list[str]] = {}

for t, _ in _iter_all_tasks(parsed):
for t, _, _gname in _iter_all_tasks(parsed):
for spec in t.hf_dataset_files or []:
repo_id = spec.get("repo_id", "")
if not repo_id:
continue
revision = spec.get("revision")
patterns = spec.get("patterns") or []
if repo_id not in merged:
merged[repo_id] = list(patterns)
key = (repo_id, revision)
if key not in merged:
merged[key] = list(patterns)
else:
for p in patterns:
if p not in merged[repo_id]:
merged[repo_id].append(p)
if p not in merged[key]:
merged[key].append(p)

return [{"repo_id": rid, "patterns": pats} for rid, pats in merged.items()]
result = []
for (rid, rev), pats in merged.items():
entry: dict = {"repo_id": rid, "patterns": pats}
if rev:
entry["revision"] = rev
result.append(entry)
return result
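Merging per `(repo_id, revision)` means one `snapshot_download` call per pinned revision fetches the union of every task's patterns. A self-contained sketch of just the merge step, simplified from the function above (the helper name is illustrative):

```python
def merge_dataset_files(specs: list[dict]) -> list[dict]:
    """Union allow-patterns across specs sharing (repo_id, revision)."""
    merged: dict[tuple[str, str | None], list[str]] = {}
    for spec in specs:
        repo_id = spec.get("repo_id", "")
        if not repo_id:
            continue
        key = (repo_id, spec.get("revision"))
        pats = merged.setdefault(key, [])
        for p in spec.get("patterns") or []:
            if p not in pats:
                pats.append(p)
    result: list[dict] = []
    for (rid, rev), pats in merged.items():
        entry: dict = {"repo_id": rid, "patterns": pats}
        if rev:
            entry["revision"] = rev  # only pin when a revision was given
        result.append(entry)
    return result
```

Two specs for the same repo at the default revision collapse into one entry with the union of their patterns, while a spec pinned to a different revision stays separate so its snapshot is fetched independently.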


def _build_task_dataset_map() -> dict[str, list[DatasetSpec]]:
@@ -268,7 +283,7 @@ def _build_task_dataset_map() -> dict[str, list[DatasetSpec]]:

task_map: dict[str, list[DatasetSpec]] = {}

for t, _ in _iter_all_tasks(parsed):
for t, _, _gname in _iter_all_tasks(parsed):
if t.dataset and t.name not in task_map:
if t.dataset == "facebook/flores" and not t.subset:
task_map[t.name] = [
45 changes: 25 additions & 20 deletions oellm/utils.py
@@ -314,21 +314,9 @@ def _process_model_paths(models: Iterable[str]):
if "HF_HOME" in os.environ
else None
)
try:
from huggingface_hub import try_to_load_from_cache

cached = try_to_load_from_cache(
model, "config.json", cache_dir=cache_dir
)
if isinstance(cached, str):
logging.info(
f"Model '{model}' already cached, skipping download"
)
per_model_paths.append(model)
continue
except Exception:
pass
status.update(f"Downloading '{model}' ({idx}/{len(models_list)})")
# snapshot_download is idempotent — it skips files that
# are already cached and only fetches missing ones.
snapshot_download(
repo_id=model,
cache_dir=cache_dir,
@@ -373,16 +361,20 @@ def _pre_download_hf_dataset_files(dataset_files: list[dict]) -> None:
for idx, spec in enumerate(dataset_files, 1):
repo_id = spec.get("repo_id", "")
patterns = spec.get("patterns")
revision = spec.get("revision")
status.update(f"Downloading '{repo_id}' ({idx}/{len(dataset_files)})")
try:
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
allow_patterns=patterns,
cache_dir=Path(os.getenv("HF_HOME")) / "hub"
kwargs = {
"repo_id": repo_id,
"repo_type": "dataset",
"allow_patterns": patterns,
"cache_dir": Path(os.getenv("HF_HOME")) / "hub"
if "HF_HOME" in os.environ
else None,
)
}
if revision:
kwargs["revision"] = revision
snapshot_download(**kwargs)
except Exception as e:
logging.warning(f"Failed to download dataset files from '{repo_id}': {e}")

@@ -391,6 +383,7 @@ def _pre_download_datasets_from_specs(
specs: Iterable, trust_remote_code: bool = True
) -> None:
from datasets import get_dataset_config_names, load_dataset
from huggingface_hub import snapshot_download

specs_list = list(specs)
if not specs_list:
@@ -406,6 +399,18 @@
label = f"{spec.repo_id}" + (f"/{spec.subset}" if spec.subset else "")
status.update(f"Downloading '{label}' ({idx}/{len(specs_list)})")

# Video datasets: lmms-eval calls snapshot_download at runtime
# to get raw video files, then symlinks them into $HF_HOME.
# Pre-download so offline compute nodes find everything cached.
if spec.video:
try:
snapshot_download(
repo_id=spec.repo_id,
repo_type="dataset",
)
except Exception as e:
logging.warning(f"Failed to snapshot_download '{spec.repo_id}': {e}")

try:
load_dataset(
spec.repo_id,
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -23,6 +23,9 @@ dev = [
image = [
"lmms-eval @ git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git",
]
video = [
"lmms-eval @ git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git",
]

[project.scripts]
oellm = "oellm.main:main"