From 6954ae99575499b389b173e13525ef13400d4d7f Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Tue, 21 Apr 2026 11:24:02 +0200 Subject: [PATCH 1/6] adding audiobench benchmark as contrib --- oellm/contrib/audiobench/README.md | 199 +++++++++++++ oellm/contrib/audiobench/__init__.py | 0 oellm/contrib/audiobench/adapter.py | 80 +++++ oellm/contrib/audiobench/suite.py | 417 +++++++++++++++++++++++++++ oellm/contrib/audiobench/task.py | 293 +++++++++++++++++++ oellm/resources/template.sbatch | 5 +- pyproject.toml | 14 + 7 files changed, 1007 insertions(+), 1 deletion(-) create mode 100644 oellm/contrib/audiobench/README.md create mode 100644 oellm/contrib/audiobench/__init__.py create mode 100644 oellm/contrib/audiobench/adapter.py create mode 100644 oellm/contrib/audiobench/suite.py create mode 100644 oellm/contrib/audiobench/task.py diff --git a/oellm/contrib/audiobench/README.md b/oellm/contrib/audiobench/README.md new file mode 100644 index 00000000..b1eee38a --- /dev/null +++ b/oellm/contrib/audiobench/README.md @@ -0,0 +1,199 @@ +# AudioBench + +AudioBench (AudioLLMs/AudioBench, [arXiv 2406.16020](https://arxiv.org/abs/2406.16020)) +is a broad audio-understanding benchmark covering ASR, speech translation, +spoken reasoning, audio scene QA, and paralinguistics. This contrib plugin +wraps AudioBench as a callable `audiobench` suite inside elliot-cli so WP4 +can produce numbers directly comparable with the AudioBench paper and +leaderboard, without the scoring-normalisation drift that would come from +running the same datasets through lmms-eval. + +## Scope — Phase 1 (this release) + +**27 judge-free tasks** across ASR (WER), speech translation (BLEU), spoken +reasoning (accuracy / string_match), and AudioCaps (METEOR). Of these: + +- **20 tasks are genuinely new** to the platform — not in any of our + existing lmms-eval `audio-*` groups. Examples: `earnings21_test`, + `earnings22_test`, GigaSpeech2 (Thai / Indonesian / Vietnamese), + SEAME code-switch, Spoken-MQA reasoning splits, MMAU mini. +- **7 tasks are dual-registered** duplicates of benchmarks we already run + through lmms-eval (LibriSpeech test-clean/other, Common Voice 15 EN, + GigaSpeech, People's Speech, TED-LIUM 3, CoVoST2 en→zh). These use + AudioBench's own scorer and normaliser so WP4 can report numbers + aligned with the AudioBench paper. + +Every AudioBench task is namespaced with an `audiobench_` prefix so the CSV +`task_path` column unambiguously identifies which scorer produced a number +(e.g. `audiobench_librispeech_test_clean` is AudioBench-scored; +`librispeech_test_clean` remains the lmms-eval version). + +**Phase 2** (not in this release) will add ~19 judge-dependent tasks +(SLUE-SQA5, Spoken-SQuAD, AudioCaps-QA, IEMOCAP / MELD / VoxCeleb probes, +AudioLLM-InstructionFollowing) once a vLLM judge server is provisioned on +Leonardo. + +## Prerequisites + +### 1. Clone AudioBench on the cluster + +AudioBench is **not** pip-installable — upstream is a script harness with +bare imports (`from dataset import ...` inside `src/main_evaluate.py`) and +no `pyproject.toml` / `setup.py`. The plugin invokes it as a subprocess +from an on-cluster clone. + +```bash +git clone https://github.com/AudioLLMs/AudioBench /path/to/AudioBench +``` + +We track the **latest `main`** — no pinned SHA — so updates are a simple +`git pull` under `$AUDIOBENCH_DIR`. If a breaking upstream change lands, +file an issue and we'll introduce a pin. + +### 2. 
Install AudioBench's own runtime dependencies + +Still inside the clone: + +```bash +cd /path/to/AudioBench +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +``` + +AudioBench's deps (unpinned upstream): `transformers`, `vllm`, `datasets`, +`torchaudio`, `peft`, `autoawq`, `huggingface-hub`, `librosa`, `soundfile`, +`fire`, `evaluate`, `jiwer`, `more_itertools`. Use a **separate venv** +from the elliot-cli venv — AudioBench typically pulls in a bleeding-edge +`transformers` that will conflict with lmms-eval's pin. + +### 3. Configure `clusters.yaml` + +Add `AUDIOBENCH_DIR` to your cluster block in +`oellm/resources/clusters.yaml`: + +```yaml +leonardo: + ... + AUDIOBENCH_DIR: "/leonardo/home/userexternal//AudioBench" +``` + +The plugin fails fast at dispatch time (via +`oellm.contrib.dispatch`'s `CLUSTER_ENV_VARS` check) if the variable is +missing, so you'll get a clean error message instead of a crash deep +inside the subprocess. + +### 4. Install the elliot-cli `audiobench` extra + +On the submission / login node where you run `oellm schedule-evals`: + +```bash +uv pip install -e ".[audiobench]" +``` + +This installs our Python-side scorer deps (`jiwer`, `sacrebleu`, +`pythainlp`, `evaluate`) used for result post-processing — **not** +AudioBench itself. + +### 5. Dataset pre-download + +No manual steps required. `schedule-evals` auto-downloads every +`AudioLLMs/*` HF repo referenced by the requested task group on the +login node via `huggingface_hub.snapshot_download(max_workers=2)` so the +compute nodes do not need internet access. The rate-limit-friendly +`max_workers=2` is shared infrastructure — see `oellm/utils.py`. + +## Running + +### Available task groups + +| Task group | Leaves | What it covers | +|----------------------------------|--------|-----------------------------------------------------------------| +| `audio-audiobench` | 27 | Full Phase-1 suite (everything below). | +| `audio-audiobench-asr` | 15 | WER tasks — 9 new + 6 dual-registered with lmms-eval. | +| `audio-audiobench-st` | 6 | BLEU speech-translation — 5 new + 1 dual (en→zh). | +| `audio-audiobench-reasoning` | 6 | Spoken-MQA × 4, MMAU mini, AudioCaps METEOR. | + +### Example + +```bash +# Full AudioBench Phase-1 suite on a Qwen2-Audio model: +oellm schedule-evals \ + --models Qwen/Qwen2-Audio-7B-Instruct \ + --task-groups audio-audiobench \ + --venv-path ~/elliot-venv + +# ASR only: +oellm schedule-evals \ + --models Qwen/Qwen2-Audio-7B-Instruct \ + --task-groups audio-audiobench-asr \ + --venv-path ~/elliot-venv + +# Smoke test with --limit: +oellm schedule-evals \ + --models Qwen/Qwen2-Audio-7B-Instruct \ + --task-groups audio-audiobench-asr \ + --limit 100 \ + --venv-path ~/elliot-venv +``` + +`--limit N` is forwarded to AudioBench's `--number_of_samples N`. When +unset, the full test split is evaluated. + +### Collecting results + +```bash +oellm collect-results \ + --eval-output-dir /path/to/evals \ + --output-csv audiobench_results.csv +``` + +The primary metric per task is what's registered in `task_metrics` +(`wer` / `bleu` / `accuracy` / `string_match` / `meteor`). Dual-registered +tasks land in the CSV **alongside** their lmms-eval counterparts, with +different `task_path` values (`audiobench_librispeech_test_clean` vs +`librispeech_test_clean`) and different `eval_suite` values (`audiobench` +vs `lmms_eval`) — no silent averaging. 
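For example, a collection run that covers both suites produces two independent
rows for LibriSpeech test-clean (illustrative excerpt — only the distinguishing
columns are shown):

| `task_path`                          | `eval_suite` | Primary metric                          |
|--------------------------------------|--------------|-----------------------------------------|
| `audiobench_librispeech_test_clean`  | `audiobench` | `wer` (AudioBench scorer + normaliser)  |
| `librispeech_test_clean`             | `lmms_eval`  | `wer` (lmms-eval scorer)                |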
+ +## Supported model adapters + +| Model path pattern | AudioBench `--model` key | +|-------------------------------------|--------------------------| +| `*qwen2-audio*` / `*qwen-audio*` | `qwen2_audio` | +| `*salmonn*` | `salmonn` | +| `*ltu-*` / `*/ltu*` / `*ltu_as*` | `ltu` | +| `*whisper-*` / `*/whisper*` | `whisper` | +| `*audio-flamingo*` / `*audioflamingo*` | `audioflamingo` | +| `*meralion*` | `meralion` | +| (anything else) | `generic` (default HF pipeline) | + +To override detection explicitly, pass the key as a suffix in the suite +column: `audiobench:qwen2_audio`. The dispatcher in +`oellm/contrib/dispatch.py` already splits on `:`. + +## How results flow end-to-end + +1. `schedule-evals` expands `audio-audiobench*` groups → 27 rows in + `jobs.csv` with `eval_suite=audiobench` (plus an adapter suffix from + `detect_model_flags`). +2. `_collect_dataset_specs` auto-derives `needs_snapshot_download=True` + from the group-name prefix (`audio-*`) and snapshots every referenced + `AudioLLMs/*` repo to the shared HF cache. +3. `template.sbatch`'s `*)` catch-all invokes + `python -m oellm.contrib.dispatch --suite audiobench: …`. +4. `oellm.contrib.audiobench.suite.run()` subprocesses + `python src/main_evaluate.py …` inside `$AUDIOBENCH_DIR`, captures + the result JSON AudioBench writes under its `--log_dir`, extracts the + metric value, and writes a lmms-eval-compatible JSON at + `$output_path`. +5. `collect-results` reads it via `parse_results()` and the standard + `_resolve_metric` fallback chain — no special-casing in core code. + +## Open questions / Phase-2 prerequisites + +- **Judge service hosting:** Phase 2 needs a Llama-3-70B-AWQ judge on an + OpenAI-compatible endpoint. Plan is a separate long-running vLLM sbatch + whose URL/model lands in `clusters.yaml` as `AUDIOBENCH_JUDGE_URL` and + `AUDIOBENCH_JUDGE_MODEL`. +- **MERaLiON / IMDA NSC tasks:** ~21 gated AudioBench tasks require + corpora not on public HF. These will ship in a later phase — or not, + depending on whether WP4 needs them. diff --git a/oellm/contrib/audiobench/__init__.py b/oellm/contrib/audiobench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/oellm/contrib/audiobench/adapter.py b/oellm/contrib/audiobench/adapter.py new file mode 100644 index 00000000..2b15c7f6 --- /dev/null +++ b/oellm/contrib/audiobench/adapter.py @@ -0,0 +1,80 @@ +"""AudioBench model adapter. + +Maps a HuggingFace model path (or local filesystem path) to the string key +that AudioBench's ``src/main_evaluate.py --model`` argument expects. The +upstream dispatch table lives in ``AudioBench/src/model.py`` and is +hand-wired — one entry per model family. + +The adapter returns one of: + +- ``"qwen2_audio"`` — Qwen2-Audio / Qwen-Audio checkpoints. +- ``"salmonn"`` — SALMONN family (Tsinghua). +- ``"ltu"`` — Listen-Think-Understand. +- ``"whisper"`` — Whisper (OpenAI). +- ``"audioflamingo"`` — Audio-Flamingo (NVIDIA). +- ``"meralion"`` — MERaLiON (Singapore-NLP). +- ``"generic"`` — fallback. AudioBench treats this as the default HF + pipeline dispatch, which works for many generic audio + LLMs but may need tuning per model. + +The detected value is passed to :mod:`oellm.contrib.dispatch` as the +``model_flags`` portion of the ``eval_suite`` column +(``audiobench:``), exactly like the regiondial_bench pattern. +""" + +from __future__ import annotations + +from oellm.core.base_model_adapter import BaseModelAdapter + +# (model-family key, substrings to match in lowered model path) +# Order matters — first match wins. 
More-specific patterns must appear +# before their super-strings (e.g. "qwen2-audio" before "qwen"). +_PATTERNS: list[tuple[str, tuple[str, ...]]] = [ + ("qwen2_audio", ("qwen2-audio", "qwen2_audio", "qwen-audio", "qwen_audio")), + ("salmonn", ("salmonn",)), + # LTU checkpoints often have paths like "ltu-as/", "ltu-7b", or + # "MIT/ltu". Prefix with "/" / "-" where possible to avoid false + # matches (e.g. "altus"). + ("ltu", ("ltu-", "/ltu", "_ltu", "ltu_as")), + ("whisper", ("whisper-", "/whisper", "openai/whisper")), + ("audioflamingo", ("audio-flamingo", "audioflamingo", "audio_flamingo")), + ("meralion", ("meralion",)), +] + + +class AudioBenchModelAdapter(BaseModelAdapter): + """Adapter that resolves ``--model`` flag for AudioBench subprocess.""" + + def __init__(self, model_path: str) -> None: + self._path = model_path + + @property + def model_path(self) -> str: + return self._path + + def to_lm_eval_args(self) -> str: + # Not used — AudioBench doesn't route through lm-eval. Provided + # only to satisfy the BaseModelAdapter contract. + return f"pretrained={self._path},trust_remote_code=True" + + def to_lmms_eval_args(self) -> str: + # Not used — see note on to_lm_eval_args(). + return f"pretrained={self._path}" + + def to_contrib_flags(self) -> str | None: + """Return the AudioBench ``--model`` key for this model path.""" + lowered = self._path.lower() + for key, needles in _PATTERNS: + if any(n in lowered for n in needles): + return key + return "generic" + + +def detect_audiobench_model_type(model_path: str) -> str: + """Module-level convenience — matches :func:`oellm.constants.detect_lmms_model_type`. + + Returns the same value as + ``AudioBenchModelAdapter(model_path).to_contrib_flags()`` but never + returns ``None`` (falls back to ``"generic"``). + """ + return AudioBenchModelAdapter(model_path).to_contrib_flags() or "generic" diff --git a/oellm/contrib/audiobench/suite.py b/oellm/contrib/audiobench/suite.py new file mode 100644 index 00000000..7fed48a1 --- /dev/null +++ b/oellm/contrib/audiobench/suite.py @@ -0,0 +1,417 @@ +"""AudioBench contrib suite — plugin protocol implementation. + +Implements the :mod:`oellm.registry` plugin protocol for the AudioBench +benchmark (AudioLLMs/AudioBench, arXiv 2406.16020). AudioBench is **not** a +pip-installable library — it is a script harness. We invoke its entry point +via ``python src/main_evaluate.py`` as a subprocess, from a clone pointed at +by the ``$AUDIOBENCH_DIR`` environment variable (configured in +``clusters.yaml``). This mirrors the precedent set by ``regiondial_bench``. + +Cluster setup +------------- +The following environment variables must be set in ``clusters.yaml`` (or the +cluster's module/profile system) before using any ``audio-audiobench-*`` +task group: + +``AUDIOBENCH_DIR`` + Absolute path to a local clone of + https://github.com/AudioLLMs/AudioBench. The entry point + ``src/main_evaluate.py`` must be present and the repo's own Python + dependencies must be installed in the active environment. + +Phase 2 (judge-dependent tasks) will additionally require: + +``AUDIOBENCH_JUDGE_URL`` / ``AUDIOBENCH_JUDGE_MODEL`` + OpenAI-compatible URL and model name for the judge server (typically a + vLLM deployment of ``meta-llama/Meta-Llama-3-70B-Instruct-AWQ``). Not + needed for Phase-1 judge-free tasks shipped today. 
+ +Output format +------------- +:func:`run` writes a lmms-eval-compatible JSON file to *output_path* so +that :func:`oellm.main.collect_results` can parse it without modification:: + + { + "model_name_or_path": "", + "results": { + "audiobench_librispeech_test_clean": { + "wer": 0.047 + } + }, + "configs": { + "audiobench_librispeech_test_clean": {"num_fewshot": 0} + } + } +""" + +from __future__ import annotations + +import json +import logging +import os +import subprocess +from pathlib import Path + +from oellm.contrib.audiobench.task import ( + AUDIOBENCH_TASKS, + SUITE_NAME, + AudioBenchTaskSpec, + get_task_spec, +) + +logger = logging.getLogger(__name__) + +CLUSTER_ENV_VARS = ["AUDIOBENCH_DIR"] + +# Mapping family → (group_name, human description). +_FAMILY_GROUPS = { + "asr": ( + "audio-audiobench-asr", + "AudioBench ASR tasks (WER). Covers AudioBench-scored LibriSpeech, " + "Common Voice 15 EN, GigaSpeech, People's Speech, TED-LIUM 3 — dual " + "with our lmms-eval versions for paper-comparable numbers — plus new " + "tasks not in lmms-eval: earnings21/22, TED-LIUM 3 long-form, " + "AISHELL Mandarin, GigaSpeech2 (th/id/vi), SEAME code-switch.", + ), + "st": ( + "audio-audiobench-st", + "AudioBench speech-translation tasks (BLEU). CoVoST2 covering " + "en↔id, en↔ta, zh→en, ta→en (new), plus dual-registered en→zh.", + ), + "reasoning": ( + "audio-audiobench-reasoning", + "AudioBench spoken reasoning / captioning. Spoken-MQA digit + " + "reasoning splits (accuracy), MMAU-mini (string_match), " + "AudioCaps (METEOR).", + ), +} + +_TOP_LEVEL_GROUP = "audio-audiobench" +_TOP_LEVEL_DESC = ( + "AudioBench Phase-1 suite (judge-free). Runs all 27 AudioBench tasks " + "that do not require an LLM judge: ASR (WER), speech translation (BLEU), " + "spoken reasoning (accuracy/string_match), and AudioCaps captioning " + "(METEOR). Phase 2 (judge-dependent tasks) will extend this group once " + "the judge service is configured." +) + + +def _build_task_groups() -> dict: + """Assemble the :data:`TASK_GROUPS` dict from :data:`AUDIOBENCH_TASKS`. + + One top-level ``audio-audiobench`` group containing all 27 leaves, plus + three sub-groups keyed by family (``-asr`` / ``-st`` / ``-reasoning``). + All groups are zero-shot by design — AudioBench tasks do not support + in-context examples. + """ + task_metrics: dict[str, str] = {t.name: t.metric for t in AUDIOBENCH_TASKS} + + def _task_entry(t: AudioBenchTaskSpec) -> dict: + entry: dict = {"task": t.name, "dataset": t.hf_repo} + # ``data_dir``-style subsetting: we deliberately do NOT set ``subset`` + # in the YAML entry. The reason is that ``load_dataset(name=...)`` + # used by ``_pre_download_datasets_from_specs`` treats ``subset`` as a + # config name, not a ``data_dir`` — and for gigaspeech2/spoken-mqa the + # upstream distinction is a data_dir, not a config. Since the group + # name starts with "audio-", ``_collect_dataset_specs`` auto-sets + # ``needs_snapshot_download=True`` which downloads the whole repo, + # so AudioBench can read the right data_dir at runtime. This also + # means multiple tasks sharing one HF repo dedupe to a single spec. + return entry + + groups: dict[str, dict] = {} + + # Sub-groups per family. 
+ tasks_by_family: dict[str, list[AudioBenchTaskSpec]] = { + "asr": [], + "st": [], + "reasoning": [], + } + for t in AUDIOBENCH_TASKS: + tasks_by_family[t.family].append(t) + + for family, (group_name, desc) in _FAMILY_GROUPS.items(): + entries = tasks_by_family[family] + if not entries: + continue + groups[group_name] = { + "suite": SUITE_NAME, + "n_shots": [0], + "description": desc, + "tasks": [_task_entry(t) for t in entries], + } + + # Top-level group — union of everything. + groups[_TOP_LEVEL_GROUP] = { + "suite": SUITE_NAME, + "n_shots": [0], + "description": _TOP_LEVEL_DESC, + "tasks": [_task_entry(t) for t in AUDIOBENCH_TASKS], + } + + return {"task_metrics": task_metrics, "task_groups": groups} + + +TASK_GROUPS: dict = _build_task_groups() + + +# --------------------------------------------------------------------------- +# Model-flag detection. +# --------------------------------------------------------------------------- + + +def detect_model_flags(model_path: str) -> str | None: + """Delegate to :class:`AudioBenchModelAdapter`. + + Called by :class:`oellm.runner.EvalRunner.resolve_suite` to append the + AudioBench model-family key to ``eval_suite`` as + ``audiobench:``. + """ + from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter + + return AudioBenchModelAdapter(model_path).to_contrib_flags() + + +# --------------------------------------------------------------------------- +# Runtime — subprocess into AudioBench's src/main_evaluate.py. +# --------------------------------------------------------------------------- + + +def run( + *, + model_path: str, + task: str, + n_shot: int, + output_path: Path, + model_flags: str | None, + env: dict[str, str], +) -> None: + """Execute one AudioBench task and write lmms-eval-shaped JSON. + + Args: + model_path: HF repo ID or local path of the model under evaluation. + task: Canonical task name (must start with ``audiobench_``). + n_shot: Always 0 for AudioBench — recorded in the output ``configs`` + block for downstream compatibility. + output_path: Destination for the lmms-eval-compatible result JSON. + model_flags: AudioBench ``--model`` key (e.g. ``"qwen2_audio"``); + produced by :func:`detect_model_flags`. Falls back to + ``"generic"`` if not supplied. + env: Environment dict passed to the subprocess. Must contain + ``AUDIOBENCH_DIR`` (validated by dispatch.py before ``run`` is + called, but we re-check for safety). + + Raises: + RuntimeError: if AudioBench returns non-zero or produces no output. + KeyError: if *task* is not in the registry. + """ + ab_dir = env.get("AUDIOBENCH_DIR") + if not ab_dir: + raise RuntimeError( + "AUDIOBENCH_DIR must be set. Add it to clusters.yaml — " + "it should point at a local clone of " + "https://github.com/AudioLLMs/AudioBench." + ) + + entrypoint = Path(ab_dir) / "src" / "main_evaluate.py" + if not entrypoint.exists(): + raise FileNotFoundError( + f"AudioBench entry point not found: {entrypoint}\n" + f"Check that AUDIOBENCH_DIR={ab_dir!r} points at a valid " + "AudioBench clone." + ) + + spec = get_task_spec(task) + model_key = model_flags or "generic" + + # AudioBench writes outputs under a run-specific log directory; we set + # it to our output_path's parent so we can recover the raw result. 
+ run_dir = output_path.parent / f"audiobench_{output_path.stem}" + run_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + "python", + "src/main_evaluate.py", + "--dataset", + spec.upstream_name, + "--model", + model_key, + "--model_name", + model_path, + "--metrics", + spec.upstream_metric, + "--log_dir", + str(run_dir), + ] + if spec.data_dir: + cmd.extend(["--data_dir", spec.data_dir]) + + # Forward LIMIT (set by template.sbatch) as AudioBench's + # --number_of_samples when present. "-1" means no limit in AudioBench. + limit = env.get("LIMIT", "").strip() + if limit: + cmd.extend(["--number_of_samples", str(limit)]) + + logger.info("AudioBench cmd: %s (cwd=%s)", " ".join(cmd), ab_dir) + completed = subprocess.run( + cmd, + cwd=ab_dir, + env=env, + check=False, + ) + if completed.returncode != 0: + raise RuntimeError( + f"AudioBench exited with code {completed.returncode} for " + f"task={task!r} model={model_path!r}" + ) + + metrics = _extract_metrics(run_dir, spec) + _write_lmms_shaped_json( + output_path=output_path, + model_path=model_path, + task_name=task, + n_shot=n_shot, + metrics=metrics, + ) + logger.info("Results written to %s", output_path) + + +def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float]: + """Find AudioBench's per-task score JSON inside *run_dir* and read it. + + AudioBench writes one JSON file per task under its ``--log_dir`` with + the score under a key matching ``--metrics``. We search recursively + for any ``*.json`` and pick the first one whose body contains the + expected metric key. This is intentionally lenient because upstream + log-layout has changed across releases. + + Raises: + RuntimeError: if no matching result file is found. + """ + candidates = sorted(run_dir.rglob("*.json")) + if not candidates: + raise RuntimeError( + f"AudioBench produced no result JSON under {run_dir}. " + "Check stdout/stderr for crashes." + ) + + target_key = spec.upstream_metric + for path in candidates: + try: + with open(path) as f: + body = json.load(f) + except (json.JSONDecodeError, OSError): + continue + value = _find_metric(body, target_key) + if value is not None: + # Emit the metric under OUR canonical key (spec.metric) so the + # lmms-eval-style ``task/metric,none`` stripping in + # collect_results() resolves to what's in task_metrics.yaml. + return {spec.metric: float(value)} + + raise RuntimeError( + f"Could not locate metric {target_key!r} in any of " + f"{len(candidates)} AudioBench result JSON(s) under {run_dir}" + ) + + +def _find_metric(body: object, key: str) -> float | None: + """Recursive search for a numeric value keyed by *key* anywhere in *body*. + + AudioBench's per-task JSON has nested structure that has drifted across + releases (sometimes ``{"wer": 0.04}``, sometimes + ``{"metrics": {"wer": {"score": 0.04}}}``). We tolerate either form. 
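    For example, both layouts resolve to the same value::

        >>> _find_metric({"wer": 0.04}, "wer")
        0.04
        >>> _find_metric({"metrics": {"wer": {"score": 0.04}}}, "wer")
        0.04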
+ """ + if isinstance(body, dict): + if key in body: + candidate = body[key] + if isinstance(candidate, int | float): + return float(candidate) + if isinstance(candidate, dict) and "score" in candidate: + score = candidate["score"] + if isinstance(score, int | float): + return float(score) + for v in body.values(): + found = _find_metric(v, key) + if found is not None: + return found + elif isinstance(body, list): + for item in body: + found = _find_metric(item, key) + if found is not None: + return found + return None + + +def _write_lmms_shaped_json( + *, + output_path: Path, + model_path: str, + task_name: str, + n_shot: int, + metrics: dict[str, float], +) -> None: + """Write a lmms-eval-compatible JSON at *output_path*. + + :func:`oellm.main.collect_results` reads this shape directly; the + ``_resolve_metric`` fallback chain picks up our ``task_metrics`` + mapping to extract the primary value. + """ + payload = { + "model_name_or_path": model_path, + "results": {task_name: metrics}, + "configs": {task_name: {"num_fewshot": n_shot}}, + } + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + json.dump(payload, f, indent=2) + + +# --------------------------------------------------------------------------- +# parse_results — invoked by collect_results to recognise our output files. +# --------------------------------------------------------------------------- + + +def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: + """Recognise a JSON dict produced by :func:`run`. + + Detection heuristic: the ``results`` dict contains at least one key + that starts with ``"audiobench_"``. Returns the tuple expected by + :func:`oellm.main.collect_results`: + + ``(model_id, task_name, n_shot, {metric: value})`` + + Returns ``None`` for JSON blobs that don't belong to this suite. + """ + results = data.get("results", {}) + if not isinstance(results, dict): + return None + for task_name, task_results in results.items(): + if not isinstance(task_name, str) or not task_name.startswith("audiobench_"): + continue + if not isinstance(task_results, dict): + continue + model_id = data.get("model_name_or_path") or data.get("model_name") or "unknown" + n_shot = data.get("configs", {}).get(task_name, {}).get("num_fewshot", 0) + # Coerce everything that can be float; leave non-numeric alone so + # _resolve_metric can still see them. + coerced: dict[str, float] = {} + for k, v in task_results.items(): + if isinstance(v, int | float): + coerced[k] = float(v) + return model_id, task_name, int(n_shot), coerced + return None + + +# Re-exports used by the test suite. +__all__ = [ + "CLUSTER_ENV_VARS", + "SUITE_NAME", + "TASK_GROUPS", + "detect_model_flags", + "parse_results", + "run", +] + +# Silence unused-import lint (the symbol is exported for consumer reuse). +_ = os diff --git a/oellm/contrib/audiobench/task.py b/oellm/contrib/audiobench/task.py new file mode 100644 index 00000000..2a86bc14 --- /dev/null +++ b/oellm/contrib/audiobench/task.py @@ -0,0 +1,293 @@ +"""AudioBench task registry. + +Single source of truth for the AudioBench (AudioLLMs/AudioBench, arXiv 2406.16020) +Phase-1 task set. The registry is consumed by :mod:`oellm.contrib.audiobench.suite` +to auto-generate ``TASK_GROUPS`` and to look up per-task metadata (HF repo, +upstream task name, metric) at dispatch time. 
+ +Phase 1 = judge-free tasks only (27 total): + +- **20 new** benchmarks not covered by our lmms-eval task groups + (``earnings{21,22}``, ``gigaspeech2`` {thai, indonesian, vietnamese}, + ``aishell`` ZH ASR, ``seame`` code-switch, covost2 extra language pairs, + ``spoken-mqa`` reasoning splits, ``mmau_mini``, ``audiocaps`` METEOR). +- **7 dual-registered** duplicates of benchmarks we already run via lmms-eval + (LibriSpeech test-clean/other, Common Voice 15 EN, GigaSpeech, People's + Speech, TED-LIUM 3, covost2 en→zh). These use AudioBench's own scorer + and normalizer so WP4 can compare numbers against the AudioBench paper. + +Naming +------ +Every task name is prefixed ``audiobench_`` so the CSV ``task_path`` column +uniquely identifies the scorer and there is no collision with lmms-eval's +``librispeech_test_clean`` etc. :func:`AudioBenchTaskSpec.upstream_name` +returns the bare name that AudioBench's ``src/main_evaluate.py --dataset`` +flag expects. + +Phase 2 (judge-dependent tasks) will extend this registry with ~19 more +entries driven by a vLLM Llama-3-70B judge or the OpenAI API; see the +plugin README for the rollout plan. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +SUITE_NAME = "audiobench" +_TASK_NAME_PREFIX = "audiobench_" + + +@dataclass(frozen=True) +class AudioBenchTaskSpec: + """Metadata for a single AudioBench task. + + Attributes: + name: Canonical ``audiobench_*`` task name used in the CSV + ``task_path`` column and in ``task_metrics`` / ``task_groups``. + upstream_name: The ``--dataset`` value AudioBench's + ``src/main_evaluate.py`` expects (e.g. ``"librispeech_test_clean"``). + hf_repo: HuggingFace dataset repo ID for pre-download + (e.g. ``"AudioLLMs/librispeech_test_clean"``). + metric: Primary metric key written to our ``task_metrics`` mapping. + One of ``wer`` / ``bleu`` / ``accuracy`` / ``string_match`` / + ``meteor``. + upstream_metric: The value passed to AudioBench's ``--metrics`` CLI + flag. Usually identical to :attr:`metric` but allows divergence + when AudioBench uses a different key for the same score (e.g. + ``wer`` vs ``bleu`` match; ``accuracy`` vs upstream ``acc``). + family: One of ``"asr" | "st" | "reasoning"``. Controls which + ``audio-audiobench-*`` sub-group the task lands in. + data_dir: Optional upstream ``data_dir=...`` selector, used by the + gigaspeech2 multi-language repo. Passed to AudioBench via + ``--data_dir`` (upstream convention). + """ + + name: str + upstream_name: str + hf_repo: str + metric: str + upstream_metric: str + family: str + data_dir: str | None = None + + @property + def task_group(self) -> str: + """Return the ``audio-audiobench-*`` sub-group this task belongs to.""" + return f"audio-audiobench-{self.family}" + + +def _t( + upstream_name: str, + hf_repo: str, + metric: str, + family: str, + *, + upstream_metric: str | None = None, + data_dir: str | None = None, + name: str | None = None, +) -> AudioBenchTaskSpec: + """Helper — build an :class:`AudioBenchTaskSpec` with sensible defaults. + + By default the canonical name is ``audiobench_``. Pass + ``name`` to override (used when upstream names collide across + data_dir variants of the same HF repo, e.g. gigaspeech2). 
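    For example, with the defaults::

        >>> _t("earnings21_test", "AudioLLMs/earnings21_test", "wer", "asr").name
        'audiobench_earnings21_test'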
+ """ + return AudioBenchTaskSpec( + name=name if name is not None else _TASK_NAME_PREFIX + upstream_name, + upstream_name=upstream_name, + hf_repo=hf_repo, + metric=metric, + upstream_metric=upstream_metric or metric, + family=family, + data_dir=data_dir, + ) + + +# --------------------------------------------------------------------------- +# Bucket B — 20 genuinely new tasks (not in our lmms-eval task groups) +# --------------------------------------------------------------------------- + +_BUCKET_B_ASR = [ + # Mandarin ASR (not in lmms-eval). + _t("aishell_asr_zh_test", "AudioLLMs/aishell_1_zh_test", "wer", "asr"), + # Long-form English ASR from financial calls. + _t("earnings21_test", "AudioLLMs/earnings21_test", "wer", "asr"), + _t("earnings22_test", "AudioLLMs/earnings22_test", "wer", "asr"), + # Long-form TED talks (distinct from our tedlium_dev_test). + _t("tedlium3_long_form_test", "AudioLLMs/tedlium3_long_form_test", "wer", "asr"), + # GigaSpeech2 — multilingual SE-Asian ASR. All 3 share one HF repo and + # are disambiguated by ``data_dir``. Upstream --dataset name is the same, + # so we override ``name`` with a language suffix to keep canonical names + # unique in our CSV. + _t( + "gigaspeech2", + "AudioLLMs/gigaspeech2-test", + "wer", + "asr", + data_dir="th-test", + name="audiobench_gigaspeech2_thai", + ), + _t( + "gigaspeech2", + "AudioLLMs/gigaspeech2-test", + "wer", + "asr", + data_dir="id-test", + name="audiobench_gigaspeech2_indo", + ), + _t( + "gigaspeech2", + "AudioLLMs/gigaspeech2-test", + "wer", + "asr", + data_dir="vi-test", + name="audiobench_gigaspeech2_viet", + ), + # SEAME code-switch (English ↔ Mandarin). + _t("seame_dev_man", "AudioLLMs/seame_dev_man", "wer", "asr"), + _t("seame_dev_sge", "AudioLLMs/seame_dev_sge", "wer", "asr"), +] + +_BUCKET_B_ST = [ + # CoVoST2 language pairs not in lmms-eval (only en-zh is there). + _t("covost2_en_id_test", "AudioLLMs/covost2_en_id_test", "bleu", "st"), + _t("covost2_en_ta_test", "AudioLLMs/covost2_en_ta_test", "bleu", "st"), + _t("covost2_id_en_test", "AudioLLMs/covost2_id_en_test", "bleu", "st"), + _t("covost2_zh_en_test", "AudioLLMs/covost2_zh_en_test", "bleu", "st"), + _t("covost2_ta_en_test", "AudioLLMs/covost2_ta_en_test", "bleu", "st"), +] + +_BUCKET_B_REASONING = [ + # Spoken-MQA reasoning splits (GSM-8K-like, acc scoring). All 4 share + # one HF repo; the split is an upstream config — passed as ``data_dir`` + # so the YAML/HF snapshot_download dedups across splits while AudioBench + # still knows which split to read. + _t( + "spoken-mqa", + "amao0o0/spoken-mqa", + "accuracy", + "reasoning", + upstream_metric="acc", + data_dir="short_digit", + name="audiobench_spoken_mqa_short_digit", + ), + _t( + "spoken-mqa", + "amao0o0/spoken-mqa", + "accuracy", + "reasoning", + upstream_metric="acc", + data_dir="long_digit", + name="audiobench_spoken_mqa_long_digit", + ), + _t( + "spoken-mqa", + "amao0o0/spoken-mqa", + "accuracy", + "reasoning", + upstream_metric="acc", + data_dir="single_step_reasoning", + name="audiobench_spoken_mqa_single_step_reasoning", + ), + _t( + "spoken-mqa", + "amao0o0/spoken-mqa", + "accuracy", + "reasoning", + upstream_metric="acc", + data_dir="multi_step_reasoning", + name="audiobench_spoken_mqa_multi_step_reasoning", + ), + # MMAU mini — deterministic string-match scoring (judge-free path). + _t( + "mmau_mini", + "AudioLLMs/MMAU-mini", + "string_match", + "reasoning", + upstream_metric="string_match", + ), + # AudioCaps — METEOR is the judge-free scorer (judges also available). 
+ _t( + "audiocaps_test", + "AudioLLMs/audiocaps_test", + "meteor", + "reasoning", + upstream_metric="meteor", + ), +] + +# --------------------------------------------------------------------------- +# Bucket A — 7 dual-registered duplicates of benchmarks already in lmms-eval. +# These are for paper-comparability with AudioBench; the lmms-eval versions +# stay in place and produce independent numbers under their own task names. +# The HF repos are distinct (AudioLLMs/* vs lmms-lab/*) so there is no risk +# of snapshot_download collision. +# --------------------------------------------------------------------------- + +_BUCKET_A_DUAL = [ + # LibriSpeech (English ASR). + _t("librispeech_test_clean", "AudioLLMs/librispeech_test_clean", "wer", "asr"), + _t("librispeech_test_other", "AudioLLMs/librispeech_test_other", "wer", "asr"), + # Common Voice 15 English ASR. + _t("common_voice_15_en_test", "AudioLLMs/common_voice_15_en_test", "wer", "asr"), + # GigaSpeech v1 English ASR. + _t("gigaspeech_test", "AudioLLMs/gigaspeech_test", "wer", "asr"), + # People's Speech English ASR (note upstream repo name has the "s"). + _t("peoples_speech_test", "AudioLLMs/peoples_speech_test", "wer", "asr"), + # TED-LIUM 3 standard test (distinct from tedlium3_long_form_test above). + _t("tedlium3_test", "AudioLLMs/tedlium3_test", "wer", "asr"), + # CoVoST2 en→zh (ST). + _t("covost2_en_zh_test", "AudioLLMs/covost2_en_zh_test", "bleu", "st"), +] + + +# --------------------------------------------------------------------------- +# Public registry — flat list of all Phase-1 task specs. +# Order is stable (ASR / ST / reasoning) for deterministic YAML ordering +# and for readable test-failure diffs. +# --------------------------------------------------------------------------- + +AUDIOBENCH_TASKS: list[AudioBenchTaskSpec] = [ + *_BUCKET_B_ASR, + *_BUCKET_B_ST, + *_BUCKET_B_REASONING, + *_BUCKET_A_DUAL, +] + + +# Fail-fast consistency checks — runs at import time so a typo in the +# registry breaks the test suite rather than manifesting as a silent job +# routing error later. +def _validate() -> None: + seen_names: set[str] = set() + for t in AUDIOBENCH_TASKS: + if t.name in seen_names: + raise RuntimeError(f"Duplicate AudioBench task name {t.name!r} in registry") + seen_names.add(t.name) + if not t.name.startswith(_TASK_NAME_PREFIX): + raise RuntimeError( + f"AudioBench task {t.name!r} missing required prefix " + f"{_TASK_NAME_PREFIX!r}" + ) + if t.family not in {"asr", "st", "reasoning"}: + raise RuntimeError( + f"AudioBench task {t.name!r} has unknown family {t.family!r}" + ) + + +_validate() + + +def get_task_spec(name: str) -> AudioBenchTaskSpec: + """Look up an :class:`AudioBenchTaskSpec` by canonical task name. + + Raises + ------ + KeyError + If *name* does not correspond to any registered AudioBench task. + """ + for t in AUDIOBENCH_TASKS: + if t.name == name: + return t + known = sorted(t.name for t in AUDIOBENCH_TASKS) + raise KeyError(f"Unknown AudioBench task {name!r}. Known tasks: {', '.join(known)}") diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index d67ab948..bab9643b 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -13,7 +13,10 @@ CSV_PATH="{csv_path}" NUM_JOBS={num_jobs} TOTAL_EVALS={total_evals} -LIMIT="{limit}" +# Exported so contrib suite plugins (which spawn their own Python subprocesses +# via oellm.contrib.dispatch) can read it from os.environ. Built-in suites +# below still interpolate $LIMIT directly into their CLI flags. 
+export LIMIT="{limit}" VENV_PATH="{venv_path}" LM_EVAL_INCLUDE_PATH="{lm_eval_include_path}" diff --git a/pyproject.toml b/pyproject.toml index e8662646..09f9d036 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,20 @@ audio = [ "librosa", "jiwer", ] +# AudioBench contrib plugin. AudioBench itself is NOT pip-installable (no +# build backend upstream and bare imports like ``from dataset import ...`` +# that break after install), so we don't list it as a Python dependency. +# Instead, AUDIOBENCH_DIR in clusters.yaml points at a local git clone and +# suite.py subprocesses into ``python src/main_evaluate.py``. What we do +# need here is our own post-processing / scorer deps for result parsing. +audiobench = [ + "jiwer", # Phase 1 — WER result sanity checks + "sacrebleu", # Phase 1 — BLEU scorer verification (covost2) + "pythainlp", # Phase 1 — Thai tokenisation for gigaspeech2_thai + "evaluate", # Phase 1 — MMAU / METEOR post-processing + "soundfile", + "librosa", +] [project.scripts] oellm = "oellm.main:main" From 6e2f2a24fbd4adaf6654b76765a8f875c7696f6f Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Tue, 21 Apr 2026 11:35:22 +0200 Subject: [PATCH 2/6] add audiobench tests --- oellm/contrib/audiobench/README.md | 29 +- oellm/contrib/audiobench/adapter.py | 48 +- oellm/contrib/audiobench/suite.py | 190 ++------ oellm/contrib/audiobench/task.py | 153 ++---- oellm/resources/clusters.yaml | 17 +- pyproject.toml | 18 +- tests/test_audiobench.py | 725 ++++++++++++++++++++++++++++ 7 files changed, 830 insertions(+), 350 deletions(-) create mode 100644 tests/test_audiobench.py diff --git a/oellm/contrib/audiobench/README.md b/oellm/contrib/audiobench/README.md index b1eee38a..4306901a 100644 --- a/oellm/contrib/audiobench/README.md +++ b/oellm/contrib/audiobench/README.md @@ -8,7 +8,7 @@ can produce numbers directly comparable with the AudioBench paper and leaderboard, without the scoring-normalisation drift that would come from running the same datasets through lmms-eval. -## Scope — Phase 1 (this release) +## Scope **27 judge-free tasks** across ASR (WER), speech translation (BLEU), spoken reasoning (accuracy / string_match), and AudioCaps (METEOR). Of these: @@ -28,10 +28,9 @@ Every AudioBench task is namespaced with an `audiobench_` prefix so the CSV (e.g. `audiobench_librispeech_test_clean` is AudioBench-scored; `librispeech_test_clean` remains the lmms-eval version). -**Phase 2** (not in this release) will add ~19 judge-dependent tasks -(SLUE-SQA5, Spoken-SQuAD, AudioCaps-QA, IEMOCAP / MELD / VoxCeleb probes, -AudioLLM-InstructionFollowing) once a vLLM judge server is provisioned on -Leonardo. +Judge-dependent tasks (SLUE-SQA5, Spoken-SQuAD, AudioCaps-QA, IEMOCAP / +MELD / VoxCeleb probes, AudioLLM-InstructionFollowing) are not included +and depend on a vLLM judge service being provisioned on Leonardo. ## Prerequisites @@ -99,8 +98,7 @@ AudioBench itself. No manual steps required. `schedule-evals` auto-downloads every `AudioLLMs/*` HF repo referenced by the requested task group on the login node via `huggingface_hub.snapshot_download(max_workers=2)` so the -compute nodes do not need internet access. The rate-limit-friendly -`max_workers=2` is shared infrastructure — see `oellm/utils.py`. +compute nodes do not need internet access. ## Running @@ -108,7 +106,7 @@ compute nodes do not need internet access. 
The rate-limit-friendly | Task group | Leaves | What it covers | |----------------------------------|--------|-----------------------------------------------------------------| -| `audio-audiobench` | 27 | Full Phase-1 suite (everything below). | +| `audio-audiobench` | 27 | Full suite (everything below). | | `audio-audiobench-asr` | 15 | WER tasks — 9 new + 6 dual-registered with lmms-eval. | | `audio-audiobench-st` | 6 | BLEU speech-translation — 5 new + 1 dual (en→zh). | | `audio-audiobench-reasoning` | 6 | Spoken-MQA × 4, MMAU mini, AudioCaps METEOR. | @@ -116,7 +114,7 @@ compute nodes do not need internet access. The rate-limit-friendly ### Example ```bash -# Full AudioBench Phase-1 suite on a Qwen2-Audio model: +# Full AudioBench suite on a Qwen2-Audio model: oellm schedule-evals \ --models Qwen/Qwen2-Audio-7B-Instruct \ --task-groups audio-audiobench \ @@ -188,12 +186,11 @@ column: `audiobench:qwen2_audio`. The dispatcher in 5. `collect-results` reads it via `parse_results()` and the standard `_resolve_metric` fallback chain — no special-casing in core code. -## Open questions / Phase-2 prerequisites +## Open items -- **Judge service hosting:** Phase 2 needs a Llama-3-70B-AWQ judge on an - OpenAI-compatible endpoint. Plan is a separate long-running vLLM sbatch - whose URL/model lands in `clusters.yaml` as `AUDIOBENCH_JUDGE_URL` and - `AUDIOBENCH_JUDGE_MODEL`. +- **Judge service hosting:** judge-dependent tasks need a Llama-3-70B-AWQ + judge on an OpenAI-compatible endpoint. Plan is a separate long-running + vLLM sbatch whose URL/model lands in `clusters.yaml` as + `AUDIOBENCH_JUDGE_URL` and `AUDIOBENCH_JUDGE_MODEL`. - **MERaLiON / IMDA NSC tasks:** ~21 gated AudioBench tasks require - corpora not on public HF. These will ship in a later phase — or not, - depending on whether WP4 needs them. + corpora not on public HF. Deferred until WP4 needs them. diff --git a/oellm/contrib/audiobench/adapter.py b/oellm/contrib/audiobench/adapter.py index 2b15c7f6..4734ba69 100644 --- a/oellm/contrib/audiobench/adapter.py +++ b/oellm/contrib/audiobench/adapter.py @@ -1,40 +1,20 @@ """AudioBench model adapter. -Maps a HuggingFace model path (or local filesystem path) to the string key -that AudioBench's ``src/main_evaluate.py --model`` argument expects. The -upstream dispatch table lives in ``AudioBench/src/model.py`` and is -hand-wired — one entry per model family. - -The adapter returns one of: - -- ``"qwen2_audio"`` — Qwen2-Audio / Qwen-Audio checkpoints. -- ``"salmonn"`` — SALMONN family (Tsinghua). -- ``"ltu"`` — Listen-Think-Understand. -- ``"whisper"`` — Whisper (OpenAI). -- ``"audioflamingo"`` — Audio-Flamingo (NVIDIA). -- ``"meralion"`` — MERaLiON (Singapore-NLP). -- ``"generic"`` — fallback. AudioBench treats this as the default HF - pipeline dispatch, which works for many generic audio - LLMs but may need tuning per model. - -The detected value is passed to :mod:`oellm.contrib.dispatch` as the -``model_flags`` portion of the ``eval_suite`` column -(``audiobench:``), exactly like the regiondial_bench pattern. +Maps a HuggingFace model path to the string key that AudioBench's +``src/main_evaluate.py --model`` argument expects. The detected value is +passed to :mod:`oellm.contrib.dispatch` as the ``model_flags`` portion of +the ``eval_suite`` column (``audiobench:``). """ from __future__ import annotations from oellm.core.base_model_adapter import BaseModelAdapter -# (model-family key, substrings to match in lowered model path) -# Order matters — first match wins. 
More-specific patterns must appear -# before their super-strings (e.g. "qwen2-audio" before "qwen"). +# (model-family key, substrings to match in lowered model path). Order +# matters — first match wins, so more-specific patterns come first. _PATTERNS: list[tuple[str, tuple[str, ...]]] = [ ("qwen2_audio", ("qwen2-audio", "qwen2_audio", "qwen-audio", "qwen_audio")), ("salmonn", ("salmonn",)), - # LTU checkpoints often have paths like "ltu-as/", "ltu-7b", or - # "MIT/ltu". Prefix with "/" / "-" where possible to avoid false - # matches (e.g. "altus"). ("ltu", ("ltu-", "/ltu", "_ltu", "ltu_as")), ("whisper", ("whisper-", "/whisper", "openai/whisper")), ("audioflamingo", ("audio-flamingo", "audioflamingo", "audio_flamingo")), @@ -43,7 +23,7 @@ class AudioBenchModelAdapter(BaseModelAdapter): - """Adapter that resolves ``--model`` flag for AudioBench subprocess.""" + """Adapter resolving the ``--model`` flag for the AudioBench subprocess.""" def __init__(self, model_path: str) -> None: self._path = model_path @@ -53,16 +33,15 @@ def model_path(self) -> str: return self._path def to_lm_eval_args(self) -> str: - # Not used — AudioBench doesn't route through lm-eval. Provided - # only to satisfy the BaseModelAdapter contract. + # Unused — AudioBench doesn't route through lm-eval. Required by + # BaseModelAdapter. return f"pretrained={self._path},trust_remote_code=True" def to_lmms_eval_args(self) -> str: - # Not used — see note on to_lm_eval_args(). + # Unused — see to_lm_eval_args(). return f"pretrained={self._path}" def to_contrib_flags(self) -> str | None: - """Return the AudioBench ``--model`` key for this model path.""" lowered = self._path.lower() for key, needles in _PATTERNS: if any(n in lowered for n in needles): @@ -71,10 +50,5 @@ def to_contrib_flags(self) -> str | None: def detect_audiobench_model_type(model_path: str) -> str: - """Module-level convenience — matches :func:`oellm.constants.detect_lmms_model_type`. - - Returns the same value as - ``AudioBenchModelAdapter(model_path).to_contrib_flags()`` but never - returns ``None`` (falls back to ``"generic"``). - """ + """Like ``to_contrib_flags`` but always returns a string (default ``generic``).""" return AudioBenchModelAdapter(model_path).to_contrib_flags() or "generic" diff --git a/oellm/contrib/audiobench/suite.py b/oellm/contrib/audiobench/suite.py index 7fed48a1..e601e784 100644 --- a/oellm/contrib/audiobench/suite.py +++ b/oellm/contrib/audiobench/suite.py @@ -1,47 +1,11 @@ """AudioBench contrib suite — plugin protocol implementation. -Implements the :mod:`oellm.registry` plugin protocol for the AudioBench -benchmark (AudioLLMs/AudioBench, arXiv 2406.16020). AudioBench is **not** a -pip-installable library — it is a script harness. We invoke its entry point -via ``python src/main_evaluate.py`` as a subprocess, from a clone pointed at -by the ``$AUDIOBENCH_DIR`` environment variable (configured in -``clusters.yaml``). This mirrors the precedent set by ``regiondial_bench``. - -Cluster setup -------------- -The following environment variables must be set in ``clusters.yaml`` (or the -cluster's module/profile system) before using any ``audio-audiobench-*`` -task group: - -``AUDIOBENCH_DIR`` - Absolute path to a local clone of - https://github.com/AudioLLMs/AudioBench. The entry point - ``src/main_evaluate.py`` must be present and the repo's own Python - dependencies must be installed in the active environment. 
- -Phase 2 (judge-dependent tasks) will additionally require: - -``AUDIOBENCH_JUDGE_URL`` / ``AUDIOBENCH_JUDGE_MODEL`` - OpenAI-compatible URL and model name for the judge server (typically a - vLLM deployment of ``meta-llama/Meta-Llama-3-70B-Instruct-AWQ``). Not - needed for Phase-1 judge-free tasks shipped today. - -Output format -------------- -:func:`run` writes a lmms-eval-compatible JSON file to *output_path* so -that :func:`oellm.main.collect_results` can parse it without modification:: - - { - "model_name_or_path": "", - "results": { - "audiobench_librispeech_test_clean": { - "wer": 0.047 - } - }, - "configs": { - "audiobench_librispeech_test_clean": {"num_fewshot": 0} - } - } +AudioBench is not pip-installable (upstream has no build backend and uses +bare imports like ``from dataset import ...``), so :func:`run` invokes its +``src/main_evaluate.py`` entry point as a subprocess with ``cwd`` set to +``$AUDIOBENCH_DIR``. :func:`run` then re-shapes AudioBench's result JSON +into a lmms-eval-compatible payload that :func:`oellm.main.collect_results` +can parse unchanged. """ from __future__ import annotations @@ -63,65 +27,45 @@ CLUSTER_ENV_VARS = ["AUDIOBENCH_DIR"] -# Mapping family → (group_name, human description). _FAMILY_GROUPS = { "asr": ( "audio-audiobench-asr", - "AudioBench ASR tasks (WER). Covers AudioBench-scored LibriSpeech, " - "Common Voice 15 EN, GigaSpeech, People's Speech, TED-LIUM 3 — dual " - "with our lmms-eval versions for paper-comparable numbers — plus new " - "tasks not in lmms-eval: earnings21/22, TED-LIUM 3 long-form, " - "AISHELL Mandarin, GigaSpeech2 (th/id/vi), SEAME code-switch.", + "AudioBench ASR tasks (WER).", ), "st": ( "audio-audiobench-st", - "AudioBench speech-translation tasks (BLEU). CoVoST2 covering " - "en↔id, en↔ta, zh→en, ta→en (new), plus dual-registered en→zh.", + "AudioBench speech-translation tasks (BLEU).", ), "reasoning": ( "audio-audiobench-reasoning", - "AudioBench spoken reasoning / captioning. Spoken-MQA digit + " - "reasoning splits (accuracy), MMAU-mini (string_match), " - "AudioCaps (METEOR).", + "AudioBench spoken reasoning / captioning (accuracy / string_match / METEOR).", ), } _TOP_LEVEL_GROUP = "audio-audiobench" _TOP_LEVEL_DESC = ( - "AudioBench Phase-1 suite (judge-free). Runs all 27 AudioBench tasks " - "that do not require an LLM judge: ASR (WER), speech translation (BLEU), " - "spoken reasoning (accuracy/string_match), and AudioCaps captioning " - "(METEOR). Phase 2 (judge-dependent tasks) will extend this group once " - "the judge service is configured." + "AudioBench suite — ASR (WER), speech translation (BLEU), spoken " + "reasoning (accuracy/string_match), and AudioCaps captioning (METEOR)." ) def _build_task_groups() -> dict: - """Assemble the :data:`TASK_GROUPS` dict from :data:`AUDIOBENCH_TASKS`. + """Build ``TASK_GROUPS`` from :data:`AUDIOBENCH_TASKS`. - One top-level ``audio-audiobench`` group containing all 27 leaves, plus - three sub-groups keyed by family (``-asr`` / ``-st`` / ``-reasoning``). - All groups are zero-shot by design — AudioBench tasks do not support - in-context examples. + Always zero-shot — AudioBench does not support in-context examples. """ task_metrics: dict[str, str] = {t.name: t.metric for t in AUDIOBENCH_TASKS} def _task_entry(t: AudioBenchTaskSpec) -> dict: - entry: dict = {"task": t.name, "dataset": t.hf_repo} - # ``data_dir``-style subsetting: we deliberately do NOT set ``subset`` - # in the YAML entry. 
The reason is that ``load_dataset(name=...)`` - # used by ``_pre_download_datasets_from_specs`` treats ``subset`` as a - # config name, not a ``data_dir`` — and for gigaspeech2/spoken-mqa the - # upstream distinction is a data_dir, not a config. Since the group - # name starts with "audio-", ``_collect_dataset_specs`` auto-sets - # ``needs_snapshot_download=True`` which downloads the whole repo, - # so AudioBench can read the right data_dir at runtime. This also - # means multiple tasks sharing one HF repo dedupe to a single spec. - return entry + # We deliberately omit ``subset`` — load_dataset treats it as a + # config name, but for gigaspeech2 / spoken-mqa the upstream + # distinction is a ``data_dir``. The ``audio-*`` prefix triggers + # full-repo snapshot_download, so AudioBench can read the right + # data_dir at runtime. + return {"task": t.name, "dataset": t.hf_repo} groups: dict[str, dict] = {} - # Sub-groups per family. tasks_by_family: dict[str, list[AudioBenchTaskSpec]] = { "asr": [], "st": [], @@ -141,7 +85,6 @@ def _task_entry(t: AudioBenchTaskSpec) -> dict: "tasks": [_task_entry(t) for t in entries], } - # Top-level group — union of everything. groups[_TOP_LEVEL_GROUP] = { "suite": SUITE_NAME, "n_shots": [0], @@ -155,28 +98,13 @@ def _task_entry(t: AudioBenchTaskSpec) -> dict: TASK_GROUPS: dict = _build_task_groups() -# --------------------------------------------------------------------------- -# Model-flag detection. -# --------------------------------------------------------------------------- - - def detect_model_flags(model_path: str) -> str | None: - """Delegate to :class:`AudioBenchModelAdapter`. - - Called by :class:`oellm.runner.EvalRunner.resolve_suite` to append the - AudioBench model-family key to ``eval_suite`` as - ``audiobench:``. - """ + """Return the AudioBench ``--model`` family key for *model_path*.""" from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter return AudioBenchModelAdapter(model_path).to_contrib_flags() -# --------------------------------------------------------------------------- -# Runtime — subprocess into AudioBench's src/main_evaluate.py. -# --------------------------------------------------------------------------- - - def run( *, model_path: str, @@ -186,24 +114,10 @@ def run( model_flags: str | None, env: dict[str, str], ) -> None: - """Execute one AudioBench task and write lmms-eval-shaped JSON. - - Args: - model_path: HF repo ID or local path of the model under evaluation. - task: Canonical task name (must start with ``audiobench_``). - n_shot: Always 0 for AudioBench — recorded in the output ``configs`` - block for downstream compatibility. - output_path: Destination for the lmms-eval-compatible result JSON. - model_flags: AudioBench ``--model`` key (e.g. ``"qwen2_audio"``); - produced by :func:`detect_model_flags`. Falls back to - ``"generic"`` if not supplied. - env: Environment dict passed to the subprocess. Must contain - ``AUDIOBENCH_DIR`` (validated by dispatch.py before ``run`` is - called, but we re-check for safety). - - Raises: - RuntimeError: if AudioBench returns non-zero or produces no output. - KeyError: if *task* is not in the registry. + """Execute one AudioBench task and write a lmms-eval-shaped result JSON. + + Raises ``RuntimeError`` if AudioBench exits non-zero or produces no + parseable output, and ``KeyError`` if *task* is not registered. 
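    A typical dispatcher-side call looks roughly like this (output path and
    environment are illustrative)::

        run(
            model_path="Qwen/Qwen2-Audio-7B-Instruct",
            task="audiobench_librispeech_test_clean",
            n_shot=0,
            output_path=Path("evals/audiobench_librispeech_test_clean.json"),
            model_flags="qwen2_audio",
            env={**os.environ, "AUDIOBENCH_DIR": "/path/to/AudioBench"},
        )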
""" ab_dir = env.get("AUDIOBENCH_DIR") if not ab_dir: @@ -224,8 +138,6 @@ def run( spec = get_task_spec(task) model_key = model_flags or "generic" - # AudioBench writes outputs under a run-specific log directory; we set - # it to our output_path's parent so we can recover the raw result. run_dir = output_path.parent / f"audiobench_{output_path.stem}" run_dir.mkdir(parents=True, exist_ok=True) @@ -246,8 +158,6 @@ def run( if spec.data_dir: cmd.extend(["--data_dir", spec.data_dir]) - # Forward LIMIT (set by template.sbatch) as AudioBench's - # --number_of_samples when present. "-1" means no limit in AudioBench. limit = env.get("LIMIT", "").strip() if limit: cmd.extend(["--number_of_samples", str(limit)]) @@ -277,17 +187,7 @@ def run( def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float]: - """Find AudioBench's per-task score JSON inside *run_dir* and read it. - - AudioBench writes one JSON file per task under its ``--log_dir`` with - the score under a key matching ``--metrics``. We search recursively - for any ``*.json`` and pick the first one whose body contains the - expected metric key. This is intentionally lenient because upstream - log-layout has changed across releases. - - Raises: - RuntimeError: if no matching result file is found. - """ + """Find AudioBench's per-task result JSON under *run_dir* and read it.""" candidates = sorted(run_dir.rglob("*.json")) if not candidates: raise RuntimeError( @@ -304,9 +204,8 @@ def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float continue value = _find_metric(body, target_key) if value is not None: - # Emit the metric under OUR canonical key (spec.metric) so the - # lmms-eval-style ``task/metric,none`` stripping in - # collect_results() resolves to what's in task_metrics.yaml. + # Emit under our canonical key so collect_results' metric + # resolution picks up task_metrics.yaml. return {spec.metric: float(value)} raise RuntimeError( @@ -316,11 +215,10 @@ def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float def _find_metric(body: object, key: str) -> float | None: - """Recursive search for a numeric value keyed by *key* anywhere in *body*. + """Recursive search for a numeric value keyed by *key*. - AudioBench's per-task JSON has nested structure that has drifted across - releases (sometimes ``{"wer": 0.04}``, sometimes - ``{"metrics": {"wer": {"score": 0.04}}}``). We tolerate either form. + Tolerates both ``{"wer": 0.04}`` and ``{"metrics": {"wer": {"score": + 0.04}}}`` layouts — upstream log shape has drifted across releases. """ if isinstance(body, dict): if key in body: @@ -351,12 +249,6 @@ def _write_lmms_shaped_json( n_shot: int, metrics: dict[str, float], ) -> None: - """Write a lmms-eval-compatible JSON at *output_path*. - - :func:`oellm.main.collect_results` reads this shape directly; the - ``_resolve_metric`` fallback chain picks up our ``task_metrics`` - mapping to extract the primary value. - """ payload = { "model_name_or_path": model_path, "results": {task_name: metrics}, @@ -367,21 +259,9 @@ def _write_lmms_shaped_json( json.dump(payload, f, indent=2) -# --------------------------------------------------------------------------- -# parse_results — invoked by collect_results to recognise our output files. -# --------------------------------------------------------------------------- - - def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: - """Recognise a JSON dict produced by :func:`run`. 
- - Detection heuristic: the ``results`` dict contains at least one key - that starts with ``"audiobench_"``. Returns the tuple expected by - :func:`oellm.main.collect_results`: - - ``(model_id, task_name, n_shot, {metric: value})`` - - Returns ``None`` for JSON blobs that don't belong to this suite. + """Recognise a JSON dict produced by :func:`run` and return + ``(model_id, task_name, n_shot, metrics)``; ``None`` if it's not ours. """ results = data.get("results", {}) if not isinstance(results, dict): @@ -393,8 +273,6 @@ def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: continue model_id = data.get("model_name_or_path") or data.get("model_name") or "unknown" n_shot = data.get("configs", {}).get(task_name, {}).get("num_fewshot", 0) - # Coerce everything that can be float; leave non-numeric alone so - # _resolve_metric can still see them. coerced: dict[str, float] = {} for k, v in task_results.items(): if isinstance(v, int | float): @@ -403,7 +281,6 @@ def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: return None -# Re-exports used by the test suite. __all__ = [ "CLUSTER_ENV_VARS", "SUITE_NAME", @@ -413,5 +290,4 @@ def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: "run", ] -# Silence unused-import lint (the symbol is exported for consumer reuse). -_ = os +_ = os # exported via env dict passed to subprocess.run diff --git a/oellm/contrib/audiobench/task.py b/oellm/contrib/audiobench/task.py index 2a86bc14..849477f1 100644 --- a/oellm/contrib/audiobench/task.py +++ b/oellm/contrib/audiobench/task.py @@ -1,32 +1,12 @@ """AudioBench task registry. -Single source of truth for the AudioBench (AudioLLMs/AudioBench, arXiv 2406.16020) -Phase-1 task set. The registry is consumed by :mod:`oellm.contrib.audiobench.suite` -to auto-generate ``TASK_GROUPS`` and to look up per-task metadata (HF repo, -upstream task name, metric) at dispatch time. +Single source of truth for the task set. Consumed by +:mod:`oellm.contrib.audiobench.suite` to build ``TASK_GROUPS`` and to look up +per-task metadata (HF repo, upstream task name, metric) at dispatch time. -Phase 1 = judge-free tasks only (27 total): - -- **20 new** benchmarks not covered by our lmms-eval task groups - (``earnings{21,22}``, ``gigaspeech2`` {thai, indonesian, vietnamese}, - ``aishell`` ZH ASR, ``seame`` code-switch, covost2 extra language pairs, - ``spoken-mqa`` reasoning splits, ``mmau_mini``, ``audiocaps`` METEOR). -- **7 dual-registered** duplicates of benchmarks we already run via lmms-eval - (LibriSpeech test-clean/other, Common Voice 15 EN, GigaSpeech, People's - Speech, TED-LIUM 3, covost2 en→zh). These use AudioBench's own scorer - and normalizer so WP4 can compare numbers against the AudioBench paper. - -Naming ------- -Every task name is prefixed ``audiobench_`` so the CSV ``task_path`` column -uniquely identifies the scorer and there is no collision with lmms-eval's -``librispeech_test_clean`` etc. :func:`AudioBenchTaskSpec.upstream_name` -returns the bare name that AudioBench's ``src/main_evaluate.py --dataset`` -flag expects. - -Phase 2 (judge-dependent tasks) will extend this registry with ~19 more -entries driven by a vLLM Llama-3-70B judge or the OpenAI API; see the -plugin README for the rollout plan. +Every canonical task name is prefixed ``audiobench_`` so the CSV ``task_path`` +column uniquely identifies the scorer and doesn't collide with lmms-eval's +names for the same benchmark. 
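A quick round-trip sketch of the contract between the JSON that `run()` writes and what `parse_results` hands back to `collect_results` (paths and scores are made up):

```python
# Round-trip sketch: run() writes this shape; parse_results() recognises it.
from oellm.contrib.audiobench.suite import parse_results

blob = {
    "model_name_or_path": "/models/Qwen2-Audio-7B-Instruct",
    "results": {"audiobench_mmau_mini": {"string_match": 0.41}},
    "configs": {"audiobench_mmau_mini": {"num_fewshot": 0}},
}
model_id, task_name, n_shot, metrics = parse_results(blob)
assert (model_id, task_name, n_shot) == (
    "/models/Qwen2-Audio-7B-Instruct", "audiobench_mmau_mini", 0
)
assert metrics == {"string_match": 0.41}

# lmms-eval blobs (no ``audiobench_`` prefix in results) are not ours:
assert parse_results({"results": {"librispeech_test_clean": {"wer": 0.05}}}) is None
```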
""" from __future__ import annotations @@ -41,25 +21,11 @@ class AudioBenchTaskSpec: """Metadata for a single AudioBench task. - Attributes: - name: Canonical ``audiobench_*`` task name used in the CSV - ``task_path`` column and in ``task_metrics`` / ``task_groups``. - upstream_name: The ``--dataset`` value AudioBench's - ``src/main_evaluate.py`` expects (e.g. ``"librispeech_test_clean"``). - hf_repo: HuggingFace dataset repo ID for pre-download - (e.g. ``"AudioLLMs/librispeech_test_clean"``). - metric: Primary metric key written to our ``task_metrics`` mapping. - One of ``wer`` / ``bleu`` / ``accuracy`` / ``string_match`` / - ``meteor``. - upstream_metric: The value passed to AudioBench's ``--metrics`` CLI - flag. Usually identical to :attr:`metric` but allows divergence - when AudioBench uses a different key for the same score (e.g. - ``wer`` vs ``bleu`` match; ``accuracy`` vs upstream ``acc``). - family: One of ``"asr" | "st" | "reasoning"``. Controls which - ``audio-audiobench-*`` sub-group the task lands in. - data_dir: Optional upstream ``data_dir=...`` selector, used by the - gigaspeech2 multi-language repo. Passed to AudioBench via - ``--data_dir`` (upstream convention). + ``upstream_name`` is what AudioBench's ``--dataset`` flag expects; + ``upstream_metric`` is what ``--metrics`` expects (usually identical to + ``metric``). ``data_dir`` is the optional upstream ``--data_dir`` + selector used when multiple tasks share one HF repo (gigaspeech2, + spoken-mqa). """ name: str @@ -72,7 +38,6 @@ class AudioBenchTaskSpec: @property def task_group(self) -> str: - """Return the ``audio-audiobench-*`` sub-group this task belongs to.""" return f"audio-audiobench-{self.family}" @@ -86,12 +51,7 @@ def _t( data_dir: str | None = None, name: str | None = None, ) -> AudioBenchTaskSpec: - """Helper — build an :class:`AudioBenchTaskSpec` with sensible defaults. - - By default the canonical name is ``audiobench_``. Pass - ``name`` to override (used when upstream names collide across - data_dir variants of the same HF repo, e.g. gigaspeech2). - """ + """Build a spec with ``name = audiobench_`` by default.""" return AudioBenchTaskSpec( name=name if name is not None else _TASK_NAME_PREFIX + upstream_name, upstream_name=upstream_name, @@ -103,22 +63,13 @@ def _t( ) -# --------------------------------------------------------------------------- -# Bucket B — 20 genuinely new tasks (not in our lmms-eval task groups) -# --------------------------------------------------------------------------- - -_BUCKET_B_ASR = [ - # Mandarin ASR (not in lmms-eval). +# Tasks not covered by our lmms-eval task groups. +_NEW_ASR = [ _t("aishell_asr_zh_test", "AudioLLMs/aishell_1_zh_test", "wer", "asr"), - # Long-form English ASR from financial calls. _t("earnings21_test", "AudioLLMs/earnings21_test", "wer", "asr"), _t("earnings22_test", "AudioLLMs/earnings22_test", "wer", "asr"), - # Long-form TED talks (distinct from our tedlium_dev_test). _t("tedlium3_long_form_test", "AudioLLMs/tedlium3_long_form_test", "wer", "asr"), - # GigaSpeech2 — multilingual SE-Asian ASR. All 3 share one HF repo and - # are disambiguated by ``data_dir``. Upstream --dataset name is the same, - # so we override ``name`` with a language suffix to keep canonical names - # unique in our CSV. + # GigaSpeech2 — 3 languages share one HF repo, disambiguated by data_dir. _t( "gigaspeech2", "AudioLLMs/gigaspeech2-test", @@ -143,13 +94,11 @@ def _t( data_dir="vi-test", name="audiobench_gigaspeech2_viet", ), - # SEAME code-switch (English ↔ Mandarin). 
_t("seame_dev_man", "AudioLLMs/seame_dev_man", "wer", "asr"), _t("seame_dev_sge", "AudioLLMs/seame_dev_sge", "wer", "asr"), ] -_BUCKET_B_ST = [ - # CoVoST2 language pairs not in lmms-eval (only en-zh is there). +_NEW_ST = [ _t("covost2_en_id_test", "AudioLLMs/covost2_en_id_test", "bleu", "st"), _t("covost2_en_ta_test", "AudioLLMs/covost2_en_ta_test", "bleu", "st"), _t("covost2_id_en_test", "AudioLLMs/covost2_id_en_test", "bleu", "st"), @@ -157,11 +106,8 @@ def _t( _t("covost2_ta_en_test", "AudioLLMs/covost2_ta_en_test", "bleu", "st"), ] -_BUCKET_B_REASONING = [ - # Spoken-MQA reasoning splits (GSM-8K-like, acc scoring). All 4 share - # one HF repo; the split is an upstream config — passed as ``data_dir`` - # so the YAML/HF snapshot_download dedups across splits while AudioBench - # still knows which split to read. +_NEW_REASONING = [ + # Spoken-MQA — 4 splits share one HF repo; split is an upstream data_dir. _t( "spoken-mqa", "amao0o0/spoken-mqa", @@ -198,66 +144,33 @@ def _t( data_dir="multi_step_reasoning", name="audiobench_spoken_mqa_multi_step_reasoning", ), - # MMAU mini — deterministic string-match scoring (judge-free path). - _t( - "mmau_mini", - "AudioLLMs/MMAU-mini", - "string_match", - "reasoning", - upstream_metric="string_match", - ), - # AudioCaps — METEOR is the judge-free scorer (judges also available). - _t( - "audiocaps_test", - "AudioLLMs/audiocaps_test", - "meteor", - "reasoning", - upstream_metric="meteor", - ), + _t("mmau_mini", "AudioLLMs/MMAU-mini", "string_match", "reasoning"), + _t("audiocaps_test", "AudioLLMs/audiocaps_test", "meteor", "reasoning"), ] -# --------------------------------------------------------------------------- -# Bucket A — 7 dual-registered duplicates of benchmarks already in lmms-eval. -# These are for paper-comparability with AudioBench; the lmms-eval versions -# stay in place and produce independent numbers under their own task names. -# The HF repos are distinct (AudioLLMs/* vs lmms-lab/*) so there is no risk -# of snapshot_download collision. -# --------------------------------------------------------------------------- - -_BUCKET_A_DUAL = [ - # LibriSpeech (English ASR). +# Dual-registered duplicates of benchmarks also in lmms-eval. These use +# AudioBench's scorer/normaliser for paper-comparable numbers; the lmms-eval +# versions stay in place. HF repos differ (AudioLLMs/* vs lmms-lab/*) so +# snapshot_download does not collide. +_DUAL = [ _t("librispeech_test_clean", "AudioLLMs/librispeech_test_clean", "wer", "asr"), _t("librispeech_test_other", "AudioLLMs/librispeech_test_other", "wer", "asr"), - # Common Voice 15 English ASR. _t("common_voice_15_en_test", "AudioLLMs/common_voice_15_en_test", "wer", "asr"), - # GigaSpeech v1 English ASR. _t("gigaspeech_test", "AudioLLMs/gigaspeech_test", "wer", "asr"), - # People's Speech English ASR (note upstream repo name has the "s"). _t("peoples_speech_test", "AudioLLMs/peoples_speech_test", "wer", "asr"), - # TED-LIUM 3 standard test (distinct from tedlium3_long_form_test above). _t("tedlium3_test", "AudioLLMs/tedlium3_test", "wer", "asr"), - # CoVoST2 en→zh (ST). _t("covost2_en_zh_test", "AudioLLMs/covost2_en_zh_test", "bleu", "st"), ] -# --------------------------------------------------------------------------- -# Public registry — flat list of all Phase-1 task specs. -# Order is stable (ASR / ST / reasoning) for deterministic YAML ordering -# and for readable test-failure diffs. 
-# --------------------------------------------------------------------------- - AUDIOBENCH_TASKS: list[AudioBenchTaskSpec] = [ - *_BUCKET_B_ASR, - *_BUCKET_B_ST, - *_BUCKET_B_REASONING, - *_BUCKET_A_DUAL, + *_NEW_ASR, + *_NEW_ST, + *_NEW_REASONING, + *_DUAL, ] -# Fail-fast consistency checks — runs at import time so a typo in the -# registry breaks the test suite rather than manifesting as a silent job -# routing error later. def _validate() -> None: seen_names: set[str] = set() for t in AUDIOBENCH_TASKS: @@ -279,13 +192,7 @@ def _validate() -> None: def get_task_spec(name: str) -> AudioBenchTaskSpec: - """Look up an :class:`AudioBenchTaskSpec` by canonical task name. - - Raises - ------ - KeyError - If *name* does not correspond to any registered AudioBench task. - """ + """Look up a spec by canonical task name; raises ``KeyError`` if missing.""" for t in AUDIOBENCH_TASKS: if t.name == name: return t diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index 36e23d29..370201b6 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -1,5 +1,5 @@ shared: - TIME_LIMIT: "00:30:00" # time limit in the format HH:MM:SS + TIME_LIMIT: "02:30:00" # time limit in the format HH:MM:SS UV_LINK_MODE: "copy" EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 @@ -7,13 +7,16 @@ shared: HF_DATASETS_DISABLE_PROGRESS_BARS: "1" leonardo: - hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML - EVAL_BASE_DIR: "/leonardo_work/AIFAC_L01_028/oellm-cli-shared-evals" - PARTITION: "boost_usr_prod" # default partition to use - ACCOUNT: "OELLM_prod2026" # default account to use - QUEUE_LIMIT: 1000 # maximum number of jobs that can be submitted as job/array, used to send only jobs that respects QOS - EVAL_CONTAINER_IMAGE: "eval_env-leonardo.sif" # name of the container image that is pulled which is built automatically with Github actions + hostname_pattern: "*.leonardo.local" + EVAL_BASE_DIR: "/leonardo/home/userexternal/islobozh/oellm-cli-shared-evals/" + PARTITION: "boost_usr_prod" + ACCOUNT: "OELLM_prod2026" + QUEUE_LIMIT: 1000 + EVAL_CONTAINER_IMAGE: "eval_env-leonardo.sif" SINGULARITY_ARGS: "--nv" + HF_HOME: "/leonardo_work/OELLM_prod2026/huggingface" + GPUS_PER_NODE: 4 + REGION_REASONER_DIR: "/leonardo/home/userexternal/islobozh/RegionReasoner" jureca: hostname_pattern: "*.jureca" diff --git a/pyproject.toml b/pyproject.toml index 09f9d036..4128db93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,17 +33,15 @@ audio = [ "librosa", "jiwer", ] -# AudioBench contrib plugin. AudioBench itself is NOT pip-installable (no -# build backend upstream and bare imports like ``from dataset import ...`` -# that break after install), so we don't list it as a Python dependency. -# Instead, AUDIOBENCH_DIR in clusters.yaml points at a local git clone and -# suite.py subprocesses into ``python src/main_evaluate.py``. What we do -# need here is our own post-processing / scorer deps for result parsing. +# AudioBench contrib plugin. AudioBench itself is not pip-installable +# (no build backend upstream, bare imports), so AUDIOBENCH_DIR in +# clusters.yaml points at a local git clone and suite.py subprocesses into +# ``python src/main_evaluate.py``. These are our post-processing deps. 
audiobench = [ - "jiwer", # Phase 1 — WER result sanity checks - "sacrebleu", # Phase 1 — BLEU scorer verification (covost2) - "pythainlp", # Phase 1 — Thai tokenisation for gigaspeech2_thai - "evaluate", # Phase 1 — MMAU / METEOR post-processing + "jiwer", # WER sanity checks + "sacrebleu", # BLEU verification (covost2) + "pythainlp", # Thai tokenisation for gigaspeech2_thai + "evaluate", # MMAU / METEOR post-processing "soundfile", "librosa", ] diff --git a/tests/test_audiobench.py b/tests/test_audiobench.py new file mode 100644 index 00000000..e3a945ee --- /dev/null +++ b/tests/test_audiobench.py @@ -0,0 +1,725 @@ +"""Tests for the AudioBench contrib benchmark integration.""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +from oellm.task_groups import ( + _collect_dataset_specs, + _expand_task_groups, + get_all_task_group_names, +) + +SUITE = "audiobench" +TOP_GROUP = "audio-audiobench" +ASR_GROUP = "audio-audiobench-asr" +ST_GROUP = "audio-audiobench-st" +REASONING_GROUP = "audio-audiobench-reasoning" + +# Canonical task names that MUST be in the registry. Assertions that +# reference individual task names go here so the audit table in the plan is +# reflected 1:1 in tests; a silent rename breaks the build. +BUCKET_B_TASKS = { + # ASR (9) + "audiobench_aishell_asr_zh_test", + "audiobench_earnings21_test", + "audiobench_earnings22_test", + "audiobench_tedlium3_long_form_test", + "audiobench_gigaspeech2_thai", + "audiobench_gigaspeech2_indo", + "audiobench_gigaspeech2_viet", + "audiobench_seame_dev_man", + "audiobench_seame_dev_sge", + # ST (5) + "audiobench_covost2_en_id_test", + "audiobench_covost2_en_ta_test", + "audiobench_covost2_id_en_test", + "audiobench_covost2_zh_en_test", + "audiobench_covost2_ta_en_test", + # Reasoning (6) + "audiobench_spoken_mqa_short_digit", + "audiobench_spoken_mqa_long_digit", + "audiobench_spoken_mqa_single_step_reasoning", + "audiobench_spoken_mqa_multi_step_reasoning", + "audiobench_mmau_mini", + "audiobench_audiocaps_test", +} + +BUCKET_A_DUAL = { + "audiobench_librispeech_test_clean", + "audiobench_librispeech_test_other", + "audiobench_common_voice_15_en_test", + "audiobench_gigaspeech_test", + "audiobench_peoples_speech_test", + "audiobench_tedlium3_test", + "audiobench_covost2_en_zh_test", +} + +ALL_PHASE1_TASKS = BUCKET_B_TASKS | BUCKET_A_DUAL + + +# --------------------------------------------------------------------------- +# Registry — task.py +# --------------------------------------------------------------------------- + + +class TestTaskRegistry: + def test_registry_has_exactly_27_tasks(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + assert len(AUDIOBENCH_TASKS) == 27 + + def test_registry_covers_all_phase1_task_names(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + names = {t.name for t in AUDIOBENCH_TASKS} + assert names == ALL_PHASE1_TASKS + + def test_every_task_has_audiobench_prefix(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + for t in AUDIOBENCH_TASKS: + assert t.name.startswith("audiobench_"), t.name + + def test_every_task_has_audiollms_or_amao_hf_repo(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + # Tasks live on AudioLLMs/* with the single exception of spoken-mqa + # (amao0o0/spoken-mqa). 
+ for t in AUDIOBENCH_TASKS: + assert t.hf_repo.startswith(("AudioLLMs/", "amao0o0/")), ( + f"{t.name} has unexpected repo {t.hf_repo}" + ) + + def test_asr_tasks_all_use_wer(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + for t in AUDIOBENCH_TASKS: + if t.family == "asr": + assert t.metric == "wer", f"{t.name}: {t.metric}" + + def test_st_tasks_all_use_bleu(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + for t in AUDIOBENCH_TASKS: + if t.family == "st": + assert t.metric == "bleu", f"{t.name}: {t.metric}" + + def test_gigaspeech2_tasks_share_repo_and_differ_by_data_dir(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + gs2 = [t for t in AUDIOBENCH_TASKS if "gigaspeech2" in t.name] + assert len(gs2) == 3 + # All share the same HF repo. + assert {t.hf_repo for t in gs2} == {"AudioLLMs/gigaspeech2-test"} + # Each has a distinct data_dir. + assert {t.data_dir for t in gs2} == {"th-test", "id-test", "vi-test"} + + def test_spoken_mqa_tasks_share_repo_and_differ_by_data_dir(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + smqa = [t for t in AUDIOBENCH_TASKS if "spoken_mqa" in t.name] + assert len(smqa) == 4 + assert {t.hf_repo for t in smqa} == {"amao0o0/spoken-mqa"} + assert {t.data_dir for t in smqa} == { + "short_digit", + "long_digit", + "single_step_reasoning", + "multi_step_reasoning", + } + + def test_get_task_spec_returns_spec(self): + from oellm.contrib.audiobench.task import get_task_spec + + spec = get_task_spec("audiobench_librispeech_test_clean") + assert spec.upstream_name == "librispeech_test_clean" + assert spec.metric == "wer" + assert spec.family == "asr" + + def test_get_task_spec_unknown_raises(self): + from oellm.contrib.audiobench.task import get_task_spec + + with pytest.raises(KeyError, match="Unknown AudioBench task"): + get_task_spec("audiobench_does_not_exist") + + +# --------------------------------------------------------------------------- +# Adapter — adapter.py +# --------------------------------------------------------------------------- + + +class TestAudioBenchModelAdapter: + @pytest.fixture + def adapter_cls(self): + from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter + from oellm.core.base_model_adapter import BaseModelAdapter + + return AudioBenchModelAdapter, BaseModelAdapter + + def test_is_base_model_adapter(self, adapter_cls): + cls, base = adapter_cls + assert issubclass(cls, base) + + def test_qwen2_audio(self, adapter_cls): + cls, _ = adapter_cls + assert cls("Qwen/Qwen2-Audio-7B-Instruct").to_contrib_flags() == "qwen2_audio" + + def test_qwen_audio(self, adapter_cls): + cls, _ = adapter_cls + assert cls("Qwen/Qwen-Audio-Chat").to_contrib_flags() == "qwen2_audio" + + def test_salmonn(self, adapter_cls): + cls, _ = adapter_cls + assert cls("tsinghua/SALMONN-13B").to_contrib_flags() == "salmonn" + + def test_ltu(self, adapter_cls): + cls, _ = adapter_cls + assert cls("MIT/ltu-as").to_contrib_flags() == "ltu" + + def test_whisper(self, adapter_cls): + cls, _ = adapter_cls + assert cls("openai/whisper-large-v3").to_contrib_flags() == "whisper" + + def test_audio_flamingo(self, adapter_cls): + cls, _ = adapter_cls + assert cls("nvidia/audio-flamingo-2").to_contrib_flags() == "audioflamingo" + + def test_meralion(self, adapter_cls): + cls, _ = adapter_cls + assert cls("Singapore-NLP/MERaLiON-7B").to_contrib_flags() == "meralion" + + def test_unknown_defaults_to_generic(self, adapter_cls): + cls, _ = adapter_cls + assert 
cls("random/unknown-model").to_contrib_flags() == "generic" + + def test_module_level_detect_function(self): + from oellm.contrib.audiobench.adapter import detect_audiobench_model_type + + assert detect_audiobench_model_type("Qwen/Qwen2-Audio-7B") == "qwen2_audio" + assert detect_audiobench_model_type("completely/unknown") == "generic" + + +# --------------------------------------------------------------------------- +# Suite plugin protocol — suite.py +# --------------------------------------------------------------------------- + + +class TestSuiteProtocol: + @pytest.fixture + def suite(self): + import oellm.contrib.audiobench.suite as s + + return s + + def test_suite_name(self, suite): + assert suite.SUITE_NAME == "audiobench" + + def test_cluster_env_vars_declared(self, suite): + assert "AUDIOBENCH_DIR" in suite.CLUSTER_ENV_VARS + + def test_task_groups_contains_all_four_groups(self, suite): + groups = suite.TASK_GROUPS["task_groups"] + for g in (TOP_GROUP, ASR_GROUP, ST_GROUP, REASONING_GROUP): + assert g in groups, f"{g} missing from TASK_GROUPS" + + def test_top_level_group_has_all_27_tasks(self, suite): + tasks = suite.TASK_GROUPS["task_groups"][TOP_GROUP]["tasks"] + assert len(tasks) == 27 + + def test_task_metrics_present_for_all_leaves(self, suite): + metrics = suite.TASK_GROUPS["task_metrics"] + assert set(metrics.keys()) == ALL_PHASE1_TASKS + + def test_all_groups_are_zero_shot(self, suite): + for name in (TOP_GROUP, ASR_GROUP, ST_GROUP, REASONING_GROUP): + group = suite.TASK_GROUPS["task_groups"][name] + assert group["n_shots"] == [0] + assert group["suite"] == SUITE + + def test_detect_model_flags_qwen2_audio(self, suite): + assert suite.detect_model_flags("Qwen/Qwen2-Audio-7B-Instruct") == "qwen2_audio" + + def test_detect_model_flags_unknown_defaults_to_generic(self, suite): + assert suite.detect_model_flags("some/unknown-model") == "generic" + + def test_parse_results_recognises_audiobench_json(self, suite): + data = { + "model_name_or_path": "/path/to/model", + "results": { + "audiobench_librispeech_test_clean": {"wer": 0.047}, + }, + "configs": {"audiobench_librispeech_test_clean": {"num_fewshot": 0}}, + } + result = suite.parse_results(data) + assert result is not None + model_id, task_name, n_shot, metrics = result + assert model_id == "/path/to/model" + assert task_name == "audiobench_librispeech_test_clean" + assert n_shot == 0 + assert metrics["wer"] == pytest.approx(0.047) + + def test_parse_results_rejects_non_audiobench_json(self, suite): + # lmms-eval style — no audiobench_ prefix. + data = { + "model_name_or_path": "some/model", + "results": {"librispeech_test_clean": {"wer,none": 0.05}}, + "configs": {"librispeech_test_clean": {"num_fewshot": 0}}, + } + assert suite.parse_results(data) is None + + def test_parse_results_empty_returns_none(self, suite): + assert suite.parse_results({}) is None + + def test_parse_results_malformed_returns_none(self, suite): + assert suite.parse_results({"results": "not a dict"}) is None + + +# --------------------------------------------------------------------------- +# TASK_GROUPS integration with core registry. 
+# --------------------------------------------------------------------------- + + +class TestTaskGroupsIntegration: + def test_groups_registered_via_registry(self): + all_names = get_all_task_group_names() + for g in (TOP_GROUP, ASR_GROUP, ST_GROUP, REASONING_GROUP): + assert g in all_names + + def test_top_group_expands_to_27_zero_shot_tasks(self): + results = _expand_task_groups([TOP_GROUP]) + assert len(results) == 27 + for r in results: + assert r.n_shot == 0 + assert r.suite == SUITE + + def test_top_group_expands_to_expected_task_names(self): + results = _expand_task_groups([TOP_GROUP]) + assert {r.task for r in results} == ALL_PHASE1_TASKS + + def test_asr_group_has_15_leaves(self): + results = _expand_task_groups([ASR_GROUP]) + # 9 bucket-B ASR + 6 bucket-A dual ASR = 15. + assert len(results) == 15 + for r in results: + assert r.suite == SUITE + + def test_st_group_has_6_leaves(self): + results = _expand_task_groups([ST_GROUP]) + # 5 bucket-B ST + 1 bucket-A dual (en→zh) = 6. + assert len(results) == 6 + + def test_reasoning_group_has_6_leaves(self): + results = _expand_task_groups([REASONING_GROUP]) + # 4 spoken-mqa + mmau_mini + audiocaps = 6. + assert len(results) == 6 + + def test_dataset_specs_flag_snapshot_download(self): + # Auto-derived from the ``audio-*`` group-name prefix in + # _collect_dataset_specs. + specs = _collect_dataset_specs([TOP_GROUP]) + assert specs, "No dataset specs returned" + for s in specs: + assert s.needs_snapshot_download, ( + f"DatasetSpec for {s.repo_id} missing needs_snapshot_download=True" + ) + + def test_dataset_specs_dedupe_shared_repos(self): + # gigaspeech2 (3 tasks) → 1 spec; spoken-mqa (4 tasks) → 1 spec. + specs = _collect_dataset_specs([TOP_GROUP]) + repo_ids = [s.repo_id for s in specs] + assert repo_ids.count("AudioLLMs/gigaspeech2-test") == 1 + assert repo_ids.count("amao0o0/spoken-mqa") == 1 + + def test_dataset_specs_contain_audiollms_repos(self): + specs = _collect_dataset_specs([TOP_GROUP]) + repo_ids = {s.repo_id for s in specs} + # Sanity-check a handful of expected entries. + assert "AudioLLMs/librispeech_test_clean" in repo_ids + assert "AudioLLMs/earnings21_test" in repo_ids + assert "AudioLLMs/MMAU-mini" in repo_ids + assert "amao0o0/spoken-mqa" in repo_ids + + +# --------------------------------------------------------------------------- +# Registry auto-discovery. +# --------------------------------------------------------------------------- + + +class TestRegistryDiscovery: + def test_audiobench_suite_is_auto_discovered(self): + # Clear the _discover() cache so this test doesn't rely on import + # order from earlier tests. + from oellm import registry + + registry._discover.cache_clear() + mod = registry.get_suite("audiobench") + assert mod.SUITE_NAME == "audiobench" + assert hasattr(mod, "run") + assert hasattr(mod, "parse_results") + assert hasattr(mod, "detect_model_flags") + + def test_task_groups_merged_into_registry(self): + from oellm import registry + + registry._discover.cache_clear() + merged = registry.get_all_task_groups() + assert TOP_GROUP in merged["task_groups"] + # task_metrics come through too. + assert "audiobench_librispeech_test_clean" in merged["task_metrics"] + + +# --------------------------------------------------------------------------- +# EvalRunner — resolve_suite wires audiobench through the adapter. 
+# --------------------------------------------------------------------------- + + +class TestRunnerIntegration: + def test_resolve_suite_appends_model_flag(self): + from oellm.constants import EvaluationJob + from oellm.runner import EvalRunner + + runner = EvalRunner() + job = EvaluationJob( + model_path="Qwen/Qwen2-Audio-7B-Instruct", + task_path="audiobench_librispeech_test_clean", + n_shot=0, + eval_suite="audiobench", + ) + result = runner.resolve_suite(job) + assert result == "audiobench:qwen2_audio" + + def test_resolve_suite_generic_fallback(self): + from oellm.constants import EvaluationJob + from oellm.runner import EvalRunner + + runner = EvalRunner() + job = EvaluationJob( + model_path="some/unknown-model", + task_path="audiobench_mmau_mini", + n_shot=0, + eval_suite="audiobench", + ) + result = runner.resolve_suite(job) + assert result == "audiobench:generic" + + +# --------------------------------------------------------------------------- +# run() subprocess harness — exercise with a mocked subprocess. +# --------------------------------------------------------------------------- + + +class TestRunHarness: + """Exercise suite.run() with a mocked subprocess, verifying the CLI + it would invoke and the output JSON it writes. + """ + + def _fake_audiobench_tree(self, tmp_path: Path) -> Path: + """Create a minimal directory tree that looks like an AudioBench clone.""" + ab_dir = tmp_path / "AudioBench" + (ab_dir / "src").mkdir(parents=True) + (ab_dir / "src" / "main_evaluate.py").write_text("# placeholder\n") + return ab_dir + + def test_run_missing_audiobench_dir_raises(self, tmp_path): + from oellm.contrib.audiobench.suite import run + + with pytest.raises(RuntimeError, match="AUDIOBENCH_DIR must be set"): + run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=tmp_path / "out.json", + model_flags="qwen2_audio", + env={}, # no AUDIOBENCH_DIR + ) + + def test_run_missing_entrypoint_raises(self, tmp_path): + from oellm.contrib.audiobench.suite import run + + bad_dir = tmp_path / "not-audiobench" + bad_dir.mkdir() + with pytest.raises(FileNotFoundError, match="AudioBench entry point"): + run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=tmp_path / "out.json", + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(bad_dir)}, + ) + + def test_run_invokes_subprocess_with_expected_cli(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + # Write a fake AudioBench result JSON into the run_dir that + # _extract_metrics will pick up. 
+ run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "task_result.json").write_text(json.dumps({"wer": 0.063})) + return _FakeCompletedProcess(0) + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + ) as mock_sp: + suite.run( + model_path="Qwen/Qwen2-Audio-7B-Instruct", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir), "LIMIT": "100"}, + ) + + assert mock_sp.call_count == 1 + cmd = mock_sp.call_args.args[0] + assert cmd[:2] == ["python", "src/main_evaluate.py"] + assert "--dataset" in cmd + assert cmd[cmd.index("--dataset") + 1] == "librispeech_test_clean" + assert cmd[cmd.index("--model") + 1] == "qwen2_audio" + assert cmd[cmd.index("--model_name") + 1] == "Qwen/Qwen2-Audio-7B-Instruct" + assert cmd[cmd.index("--metrics") + 1] == "wer" + # LIMIT propagated. + assert cmd[cmd.index("--number_of_samples") + 1] == "100" + # cwd is AUDIOBENCH_DIR. + assert mock_sp.call_args.kwargs["cwd"] == str(ab_dir) + + # Output JSON is lmms-eval-shaped and contains the extracted metric. + body = json.loads(output_path.read_text()) + assert body["model_name_or_path"] == "Qwen/Qwen2-Audio-7B-Instruct" + assert body["results"]["audiobench_librispeech_test_clean"][ + "wer" + ] == pytest.approx(0.063) + assert body["configs"]["audiobench_librispeech_test_clean"]["num_fewshot"] == 0 + + def test_run_forwards_data_dir_for_gigaspeech2(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "gs2.json").write_text(json.dumps({"wer": 0.12})) + return _FakeCompletedProcess(0) + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + ) as mock_sp: + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_gigaspeech2_thai", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + cmd = mock_sp.call_args.args[0] + assert "--data_dir" in cmd + assert cmd[cmd.index("--data_dir") + 1] == "th-test" + + def test_run_omits_number_of_samples_when_limit_empty(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "r.json").write_text(json.dumps({"wer": 0.1})) + return _FakeCompletedProcess(0) + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + ) as mock_sp: + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir), "LIMIT": ""}, + ) + + cmd = mock_sp.call_args.args[0] + assert "--number_of_samples" not in cmd + + def test_run_nonzero_exit_raises(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + return_value=_FakeCompletedProcess(1), + ): + with pytest.raises(RuntimeError, match="AudioBench exited with code 1"): + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + 
model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + def test_run_handles_nested_metric_json(self, tmp_path): + """AudioBench output format has drifted; support + ``{"metrics": {"wer": {"score": 0.05}}}`` as well as flat + ``{"wer": 0.05}``. + """ + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "nested.json").write_text( + json.dumps({"metrics": {"wer": {"score": 0.051, "notes": "nested"}}}) + ) + return _FakeCompletedProcess(0) + + with patch("oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run): + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + body = json.loads(output_path.read_text()) + assert body["results"]["audiobench_librispeech_test_clean"][ + "wer" + ] == pytest.approx(0.051) + + def test_run_missing_metric_in_output_raises(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "no_metric.json").write_text(json.dumps({"irrelevant": 1})) + return _FakeCompletedProcess(0) + + with patch("oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run): + with pytest.raises(RuntimeError, match="Could not locate metric"): + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + +class _FakeCompletedProcess: + """Stand-in for subprocess.CompletedProcess.""" + + def __init__(self, returncode: int) -> None: + self.returncode = returncode + + +# --------------------------------------------------------------------------- +# schedule_evals dry-run — wiring smoke test. +# --------------------------------------------------------------------------- + + +class TestScheduleEvalsDryRun: + def test_dry_run_writes_audiobench_suite_to_csv(self, tmp_path): + import pandas as pd + + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="Qwen/Qwen2-Audio-7B-Instruct", + task_groups=ASR_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + csv_files = list(tmp_path.glob("**/jobs.csv")) + assert len(csv_files) == 1 + df = pd.read_csv(csv_files[0]) + # All rows route to audiobench (with or without model-flag suffix). + assert all(s.startswith("audiobench") for s in df["eval_suite"].unique()) + # task_path column contains canonical audiobench_ names. 
+ assert all(t.startswith("audiobench_") for t in df["task_path"].unique()) + + def test_dry_run_sbatch_contains_contrib_dispatch(self, tmp_path): + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="Qwen/Qwen2-Audio-7B-Instruct", + task_groups=TOP_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + sbatch_files = list(tmp_path.glob("**/submit_evals.sbatch")) + assert len(sbatch_files) == 1 + content = sbatch_files[0].read_text() + assert "oellm.contrib.dispatch" in content + # LIMIT is now exported so contrib plugins can read it. + assert "export LIMIT=" in content + + +# --------------------------------------------------------------------------- +# collect_results compatibility — verify a run() output flows through unchanged. +# --------------------------------------------------------------------------- + + +class TestCollectResultsCompat: + def test_collect_results_parses_audiobench_json(self, tmp_path): + import pandas as pd + + from oellm.main import collect_results + + results_dir = tmp_path / "results" + results_dir.mkdir() + + mock_output = { + "model_name_or_path": "/cluster/models/Qwen2-Audio-7B", + "results": { + "audiobench_librispeech_test_clean": {"wer": 0.052}, + }, + "configs": {"audiobench_librispeech_test_clean": {"num_fewshot": 0}}, + } + (results_dir / "ab123.json").write_text(json.dumps(mock_output)) + + output_csv = str(tmp_path / "results.csv") + collect_results(str(tmp_path), output_csv=output_csv) + + df = pd.read_csv(output_csv) + assert len(df) == 1 + row = df.iloc[0] + assert row["task"] == "audiobench_librispeech_test_clean" + assert float(row["performance"]) == pytest.approx(0.052) + assert row["model_name"] == "/cluster/models/Qwen2-Audio-7B" From 68343b1d1610cad76ba0d26e3d888b511f69f25d Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 4 May 2026 10:55:43 +0200 Subject: [PATCH 3/6] fixes --- oellm/contrib/audiobench/adapter.py | 49 +-- oellm/contrib/audiobench/suite.py | 113 ++++--- oellm/contrib/audiobench/task.py | 40 +-- oellm/scheduler.py | 11 +- pyproject.toml | 9 +- tests/test_audiobench.py | 451 +++++++++++++++++++++------- 6 files changed, 469 insertions(+), 204 deletions(-) diff --git a/oellm/contrib/audiobench/adapter.py b/oellm/contrib/audiobench/adapter.py index 4734ba69..66a9655a 100644 --- a/oellm/contrib/audiobench/adapter.py +++ b/oellm/contrib/audiobench/adapter.py @@ -1,29 +1,38 @@ """AudioBench model adapter. -Maps a HuggingFace model path to the string key that AudioBench's -``src/main_evaluate.py --model`` argument expects. The detected value is -passed to :mod:`oellm.contrib.dispatch` as the ``model_flags`` portion of -the ``eval_suite`` column (``audiobench:``). +Maps a HuggingFace model path to AudioBench's literal ``--model_name`` value. + +AudioBench's ``Model`` class (in ``$AUDIOBENCH_DIR/src/model.py``) dispatches +on **exact-string** match against a fixed list — there is no family-level +indirection and no fallback. Each supported model has a hardcoded loader +under ``model_src/`` that loads its own HF repo internally; AudioBench +**cannot evaluate arbitrary HF checkpoints**, only the variants it knows +about. If we can't map the user's ``model_path`` to one of those literals, +we return ``None`` and ``suite.run`` raises a clear error. 
""" from __future__ import annotations from oellm.core.base_model_adapter import BaseModelAdapter -# (model-family key, substrings to match in lowered model path). Order -# matters — first match wins, so more-specific patterns come first. +# (audiobench_model_name, substrings_to_match_in_lower(model_path)). +# Order matters — first match wins; put more-specific patterns first. +# Keys MUST be the exact literals AudioBench's model.py dispatch expects. _PATTERNS: list[tuple[str, tuple[str, ...]]] = [ - ("qwen2_audio", ("qwen2-audio", "qwen2_audio", "qwen-audio", "qwen_audio")), - ("salmonn", ("salmonn",)), - ("ltu", ("ltu-", "/ltu", "_ltu", "ltu_as")), - ("whisper", ("whisper-", "/whisper", "openai/whisper")), - ("audioflamingo", ("audio-flamingo", "audioflamingo", "audio_flamingo")), - ("meralion", ("meralion",)), + ("Qwen2-Audio-7B-Instruct", ("qwen2-audio-7b-instruct", "qwen2_audio_7b_instruct")), + ("Qwen-Audio-Chat", ("qwen-audio-chat", "qwen_audio_chat")), + ("SALMONN_7B", ("salmonn",)), + ("MERaLiON-AudioLLM-Whisper-SEA-LION", ("meralion-audiollm", "meralion_audiollm")), + ("whisper_large_v3", ("whisper-large-v3", "whisper_large_v3")), + ("whisper_large_v2", ("whisper-large-v2", "whisper_large_v2")), + ("phi_4_multimodal_instruct", ("phi-4-multimodal", "phi_4_multimodal")), + ("seallms_audio_7b", ("seallms-audio-7b", "seallms_audio_7b")), + ("WavLLM_fairseq", ("wavllm",)), ] class AudioBenchModelAdapter(BaseModelAdapter): - """Adapter resolving the ``--model`` flag for the AudioBench subprocess.""" + """Adapter resolving the ``--model_name`` value for the AudioBench subprocess.""" def __init__(self, model_path: str) -> None: self._path = model_path @@ -42,13 +51,19 @@ def to_lmms_eval_args(self) -> str: return f"pretrained={self._path}" def to_contrib_flags(self) -> str | None: + """Return AudioBench's ``model_name`` dispatch key, or ``None`` if no match. + + Returning ``None`` is intentional: AudioBench has no generic loader, + so an unmatched model path must fail loudly rather than fall through + to a fictitious ``generic`` key that AudioBench doesn't recognize. + """ lowered = self._path.lower() for key, needles in _PATTERNS: if any(n in lowered for n in needles): return key - return "generic" + return None -def detect_audiobench_model_type(model_path: str) -> str: - """Like ``to_contrib_flags`` but always returns a string (default ``generic``).""" - return AudioBenchModelAdapter(model_path).to_contrib_flags() or "generic" +def detect_audiobench_model_type(model_path: str) -> str | None: + """Convenience wrapper around :meth:`AudioBenchModelAdapter.to_contrib_flags`.""" + return AudioBenchModelAdapter(model_path).to_contrib_flags() diff --git a/oellm/contrib/audiobench/suite.py b/oellm/contrib/audiobench/suite.py index e601e784..7ec28e46 100644 --- a/oellm/contrib/audiobench/suite.py +++ b/oellm/contrib/audiobench/suite.py @@ -57,11 +57,10 @@ def _build_task_groups() -> dict: task_metrics: dict[str, str] = {t.name: t.metric for t in AUDIOBENCH_TASKS} def _task_entry(t: AudioBenchTaskSpec) -> dict: - # We deliberately omit ``subset`` — load_dataset treats it as a - # config name, but for gigaspeech2 / spoken-mqa the upstream - # distinction is a ``data_dir``. The ``audio-*`` prefix triggers - # full-repo snapshot_download, so AudioBench can read the right - # data_dir at runtime. + # No ``subset`` — for gigaspeech2 / spoken-mqa the upstream split + # selection is encoded in ``upstream_name`` itself (e.g. + # ``gigaspeech2_thai``). 
The ``audio-*`` group prefix triggers + # full-repo snapshot_download in :func:`_collect_dataset_specs`. return {"task": t.name, "dataset": t.hf_repo} groups: dict[str, dict] = {} @@ -99,7 +98,13 @@ def _task_entry(t: AudioBenchTaskSpec) -> dict: def detect_model_flags(model_path: str) -> str | None: - """Return the AudioBench ``--model`` family key for *model_path*.""" + """Return AudioBench's literal ``--model_name`` dispatch key for *model_path*. + + Returns ``None`` when *model_path* does not match any AudioBench-supported + model family — :func:`run` then raises a clear error. AudioBench has no + generic loader, so silently falling back to a fictitious key would just + move the error deeper inside the subprocess. + """ from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter return AudioBenchModelAdapter(model_path).to_contrib_flags() @@ -136,27 +141,31 @@ def run( ) spec = get_task_spec(task) - model_key = model_flags or "generic" - - run_dir = output_path.parent / f"audiobench_{output_path.stem}" - run_dir.mkdir(parents=True, exist_ok=True) + if not model_flags: + raise RuntimeError( + f"Could not map model_path={model_path!r} to an AudioBench-supported " + f"model. AudioBench dispatches on a fixed list of literal " + f"model_name strings (Qwen2-Audio-7B-Instruct, SALMONN_7B, " + f"whisper_large_v3, …) — see oellm/contrib/audiobench/adapter.py. " + f"AudioBench cannot evaluate arbitrary HF checkpoints; it loads " + f"its own hardcoded HF repos per model family." + ) + model_key = model_flags # AudioBench's dispatch key, e.g. "Qwen2-Audio-7B-Instruct" cmd = [ "python", "src/main_evaluate.py", - "--dataset", + "--dataset_name", spec.upstream_name, - "--model", - model_key, "--model_name", - model_path, + model_key, "--metrics", spec.upstream_metric, - "--log_dir", - str(run_dir), + # Force re-eval — AudioBench skips by default if a stale score file + # already exists under log_for_all_models/. + "--overwrite", + "True", ] - if spec.data_dir: - cmd.extend(["--data_dir", spec.data_dir]) limit = env.get("LIMIT", "").strip() if limit: @@ -172,10 +181,12 @@ def run( if completed.returncode != 0: raise RuntimeError( f"AudioBench exited with code {completed.returncode} for " - f"task={task!r} model={model_path!r}" + f"task={task!r} model={model_path!r} (dispatch key={model_key!r})" ) - metrics = _extract_metrics(run_dir, spec) + metrics = _extract_metrics( + audiobench_dir=Path(ab_dir), model_key=model_key, spec=spec + ) _write_lmms_shaped_json( output_path=output_path, model_path=model_path, @@ -186,32 +197,48 @@ def run( logger.info("Results written to %s", output_path) -def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float]: - """Find AudioBench's per-task result JSON under *run_dir* and read it.""" - candidates = sorted(run_dir.rglob("*.json")) - if not candidates: +def _extract_metrics( + *, + audiobench_dir: Path, + model_key: str, + spec: AudioBenchTaskSpec, +) -> dict[str, float]: + """Read AudioBench's score file from its hardcoded output path. + + AudioBench writes to ``$cwd/log_for_all_models//__score.json`` + (see ``main_evaluate.py:118``). Path is fixed — there is no ``--log_dir``. + """ + score_file = ( + audiobench_dir + / "log_for_all_models" + / model_key + / f"{spec.upstream_name}_{spec.upstream_metric}_score.json" + ) + if not score_file.exists(): raise RuntimeError( - f"AudioBench produced no result JSON under {run_dir}. " - "Check stdout/stderr for crashes." + f"AudioBench did not write expected score file at {score_file}. 
" + f"Either AudioBench crashed silently, or the dispatch key " + f"{model_key!r} / dataset_name {spec.upstream_name!r} / metric " + f"{spec.upstream_metric!r} is wrong. Check stdout/stderr." ) - target_key = spec.upstream_metric - for path in candidates: - try: - with open(path) as f: - body = json.load(f) - except (json.JSONDecodeError, OSError): - continue - value = _find_metric(body, target_key) - if value is not None: - # Emit under our canonical key so collect_results' metric - # resolution picks up task_metrics.yaml. - return {spec.metric: float(value)} - - raise RuntimeError( - f"Could not locate metric {target_key!r} in any of " - f"{len(candidates)} AudioBench result JSON(s) under {run_dir}" - ) + try: + with open(score_file) as f: + body = json.load(f) + except (json.JSONDecodeError, OSError) as e: + raise RuntimeError( + f"Could not read AudioBench score file {score_file}: {e}" + ) from e + + value = _find_metric(body, spec.upstream_metric) + if value is None: + raise RuntimeError( + f"Could not locate metric {spec.upstream_metric!r} in AudioBench " + f"score file {score_file}. Body: {body!r}" + ) + # Emit under our canonical key so collect_results' metric resolution + # picks up task_metrics.yaml. + return {spec.metric: float(value)} def _find_metric(body: object, key: str) -> float | None: diff --git a/oellm/contrib/audiobench/task.py b/oellm/contrib/audiobench/task.py index 849477f1..32098880 100644 --- a/oellm/contrib/audiobench/task.py +++ b/oellm/contrib/audiobench/task.py @@ -21,11 +21,10 @@ class AudioBenchTaskSpec: """Metadata for a single AudioBench task. - ``upstream_name`` is what AudioBench's ``--dataset`` flag expects; - ``upstream_metric`` is what ``--metrics`` expects (usually identical to - ``metric``). ``data_dir`` is the optional upstream ``--data_dir`` - selector used when multiple tasks share one HF repo (gigaspeech2, - spoken-mqa). + ``upstream_name`` is the literal string AudioBench's ``--dataset_name`` + expects (matched exactly against ``$AUDIOBENCH_DIR/src/dataset.py``'s + dispatch table). ``upstream_metric`` is what ``--metrics`` expects + (usually identical to our canonical ``metric``). """ name: str @@ -34,7 +33,6 @@ class AudioBenchTaskSpec: metric: str upstream_metric: str family: str - data_dir: str | None = None @property def task_group(self) -> str: @@ -48,7 +46,6 @@ def _t( family: str, *, upstream_metric: str | None = None, - data_dir: str | None = None, name: str | None = None, ) -> AudioBenchTaskSpec: """Build a spec with ``name = audiobench_`` by default.""" @@ -59,7 +56,6 @@ def _t( metric=metric, upstream_metric=upstream_metric or metric, family=family, - data_dir=data_dir, ) @@ -69,29 +65,28 @@ def _t( _t("earnings21_test", "AudioLLMs/earnings21_test", "wer", "asr"), _t("earnings22_test", "AudioLLMs/earnings22_test", "wer", "asr"), _t("tedlium3_long_form_test", "AudioLLMs/tedlium3_long_form_test", "wer", "asr"), - # GigaSpeech2 — 3 languages share one HF repo, disambiguated by data_dir. + # GigaSpeech2 — 3 languages share one HF repo. AudioBench dispatches via + # the dataset_name string itself (gigaspeech2_thai/indo/viet), not via a + # --data_dir flag (which doesn't exist upstream). 
_t( - "gigaspeech2", + "gigaspeech2_thai", "AudioLLMs/gigaspeech2-test", "wer", "asr", - data_dir="th-test", name="audiobench_gigaspeech2_thai", ), _t( - "gigaspeech2", + "gigaspeech2_indo", "AudioLLMs/gigaspeech2-test", "wer", "asr", - data_dir="id-test", name="audiobench_gigaspeech2_indo", ), _t( - "gigaspeech2", + "gigaspeech2_viet", "AudioLLMs/gigaspeech2-test", "wer", "asr", - data_dir="vi-test", name="audiobench_gigaspeech2_viet", ), _t("seame_dev_man", "AudioLLMs/seame_dev_man", "wer", "asr"), @@ -107,41 +102,38 @@ def _t( ] _NEW_REASONING = [ - # Spoken-MQA — 4 splits share one HF repo; split is an upstream data_dir. + # Spoken-MQA — 4 splits share one HF repo. AudioBench dispatches via + # the hyphen-prefixed dataset_name (spoken-mqa_), not --data_dir. _t( - "spoken-mqa", + "spoken-mqa_short_digit", "amao0o0/spoken-mqa", "accuracy", "reasoning", upstream_metric="acc", - data_dir="short_digit", name="audiobench_spoken_mqa_short_digit", ), _t( - "spoken-mqa", + "spoken-mqa_long_digit", "amao0o0/spoken-mqa", "accuracy", "reasoning", upstream_metric="acc", - data_dir="long_digit", name="audiobench_spoken_mqa_long_digit", ), _t( - "spoken-mqa", + "spoken-mqa_single_step_reasoning", "amao0o0/spoken-mqa", "accuracy", "reasoning", upstream_metric="acc", - data_dir="single_step_reasoning", name="audiobench_spoken_mqa_single_step_reasoning", ), _t( - "spoken-mqa", + "spoken-mqa_multi_step_reasoning", "amao0o0/spoken-mqa", "accuracy", "reasoning", upstream_metric="acc", - data_dir="multi_step_reasoning", name="audiobench_spoken_mqa_multi_step_reasoning", ), _t("mmau_mini", "AudioLLMs/MMAU-mini", "string_match", "reasoning"), diff --git a/oellm/scheduler.py b/oellm/scheduler.py index af203c91..61a80ddd 100644 --- a/oellm/scheduler.py +++ b/oellm/scheduler.py @@ -306,7 +306,16 @@ def schedule_evals( logging.warning("No evaluation jobs to schedule.") return None - df["eval_suite"] = df["eval_suite"].str.lower() + # Lowercase the suite name only, preserve any ``:model_flags`` suffix + # verbatim — contrib dispatch keys can be case-sensitive (e.g. + # AudioBench's ``Qwen2-Audio-7B-Instruct`` is matched literally). + def _lower_suite_only(s: str) -> str: + if ":" in s: + head, tail = s.split(":", 1) + return f"{head.lower()}:{tail}" + return s.lower() + + df["eval_suite"] = df["eval_suite"].map(_lower_suite_only) # Ensure that all datasets required by the tasks are cached locally to avoid # network access on compute nodes. diff --git a/pyproject.toml b/pyproject.toml index 4128db93..8dcdb8f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,10 +38,11 @@ audio = [ # clusters.yaml points at a local git clone and suite.py subprocesses into # ``python src/main_evaluate.py``. These are our post-processing deps. 
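For reference while reviewing the hunks above, here is a sketch of the subprocess invocation `run()` now assembles and the fixed location the score is read back from. The task, model, and LIMIT values are illustrative; the flag names and the path layout mirror the code above.

```python
# Illustrative values only — mirrors the cmd list run() builds above.
cmd = [
    "python", "src/main_evaluate.py",
    "--dataset_name", "librispeech_test_clean",
    "--model_name", "Qwen2-Audio-7B-Instruct",  # AudioBench's literal dispatch key
    "--metrics", "wer",
    "--overwrite", "True",                      # don't reuse a stale score file
    "--number_of_samples", "100",               # only appended when LIMIT is set
]
# AudioBench then writes its score to a fixed path under $AUDIOBENCH_DIR:
#   log_for_all_models/Qwen2-Audio-7B-Instruct/librispeech_test_clean_wer_score.json
```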
audiobench = [ - "jiwer", # WER sanity checks - "sacrebleu", # BLEU verification (covost2) - "pythainlp", # Thai tokenisation for gigaspeech2_thai - "evaluate", # MMAU / METEOR post-processing + "jiwer<3", # AudioBench uses jiwer.compute_measures, removed in 3.0 + "transformers>=4.45,<5", # AudioBench's Qwen2-Audio loader uses the v4 processor API (`audios=` kwarg); v5 silently drops audio inputs and produces garbage predictions + "sacrebleu", # BLEU verification (covost2) + "pythainlp", # Thai tokenisation for gigaspeech2_thai + "evaluate", # MMAU / METEOR post-processing "soundfile", "librosa", ] diff --git a/tests/test_audiobench.py b/tests/test_audiobench.py index e3a945ee..d7617eb7 100644 --- a/tests/test_audiobench.py +++ b/tests/test_audiobench.py @@ -1,4 +1,24 @@ -"""Tests for the AudioBench contrib benchmark integration.""" +"""Tests for the AudioBench contrib benchmark integration. + +The shape of these tests reflects AudioBench's actual upstream API +(``$AUDIOBENCH_DIR/src/main_evaluate.py``), which we discovered while +debugging the first cluster smoke test: + +* ``main()`` accepts only ``dataset_name`` / ``model_name`` / ``metrics`` / + ``overwrite`` / ``number_of_samples`` — no ``--model``, no ``--log_dir``, + no ``--data_dir``. +* ``Model.__init__`` and ``Dataset.load_dataset`` dispatch on **exact** + string match against fixed lists; AudioBench cannot evaluate arbitrary + HF checkpoints (only the variants whose loaders ship under + ``model_src/``), and split selection happens via the dataset_name itself + (``gigaspeech2_thai``, ``spoken-mqa_short_digit``) — there is no + ``--data_dir`` flag. +* AudioBench writes scores to the hardcoded path + ``$cwd/log_for_all_models//__score.json``. +* Without ``--overwrite True`` AudioBench skips evaluation when a stale + score file exists, so :func:`oellm.contrib.audiobench.suite.run` always + passes that flag. +""" from __future__ import annotations @@ -22,10 +42,9 @@ ST_GROUP = "audio-audiobench-st" REASONING_GROUP = "audio-audiobench-reasoning" -# Canonical task names that MUST be in the registry. Assertions that -# reference individual task names go here so the audit table in the plan is -# reflected 1:1 in tests; a silent rename breaks the build. -BUCKET_B_TASKS = { +# Canonical task names that MUST be in the registry. A silent rename +# breaks the build. +NEW_TASKS = { # ASR (9) "audiobench_aishell_asr_zh_test", "audiobench_earnings21_test", @@ -51,7 +70,7 @@ "audiobench_audiocaps_test", } -BUCKET_A_DUAL = { +DUAL_TASKS = { "audiobench_librispeech_test_clean", "audiobench_librispeech_test_other", "audiobench_common_voice_15_en_test", @@ -61,7 +80,7 @@ "audiobench_covost2_en_zh_test", } -ALL_PHASE1_TASKS = BUCKET_B_TASKS | BUCKET_A_DUAL +ALL_PHASE1_TASKS = NEW_TASKS | DUAL_TASKS # --------------------------------------------------------------------------- @@ -90,8 +109,6 @@ def test_every_task_has_audiobench_prefix(self): def test_every_task_has_audiollms_or_amao_hf_repo(self): from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS - # Tasks live on AudioLLMs/* with the single exception of spoken-mqa - # (amao0o0/spoken-mqa). 
for t in AUDIOBENCH_TASKS: assert t.hf_repo.startswith(("AudioLLMs/", "amao0o0/")), ( f"{t.name} has unexpected repo {t.hf_repo}" @@ -111,29 +128,51 @@ def test_st_tasks_all_use_bleu(self): if t.family == "st": assert t.metric == "bleu", f"{t.name}: {t.metric}" - def test_gigaspeech2_tasks_share_repo_and_differ_by_data_dir(self): + def test_gigaspeech2_tasks_use_per_split_upstream_name(self): + """All 3 GigaSpeech2 tasks share one HF repo, but AudioBench's + ``--dataset_name`` dispatch keys are the split-suffixed forms + (``gigaspeech2_thai``/``_indo``/``_viet``) — there is no + ``--data_dir`` flag. + """ from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS gs2 = [t for t in AUDIOBENCH_TASKS if "gigaspeech2" in t.name] assert len(gs2) == 3 - # All share the same HF repo. assert {t.hf_repo for t in gs2} == {"AudioLLMs/gigaspeech2-test"} - # Each has a distinct data_dir. - assert {t.data_dir for t in gs2} == {"th-test", "id-test", "vi-test"} + assert {t.upstream_name for t in gs2} == { + "gigaspeech2_thai", + "gigaspeech2_indo", + "gigaspeech2_viet", + } - def test_spoken_mqa_tasks_share_repo_and_differ_by_data_dir(self): + def test_spoken_mqa_tasks_use_per_split_upstream_name(self): + """All 4 spoken-mqa tasks share one HF repo; AudioBench dispatches + via the hyphen-prefixed split-suffixed dataset_name + (``spoken-mqa_``). + """ from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS smqa = [t for t in AUDIOBENCH_TASKS if "spoken_mqa" in t.name] assert len(smqa) == 4 assert {t.hf_repo for t in smqa} == {"amao0o0/spoken-mqa"} - assert {t.data_dir for t in smqa} == { - "short_digit", - "long_digit", - "single_step_reasoning", - "multi_step_reasoning", + assert {t.upstream_name for t in smqa} == { + "spoken-mqa_short_digit", + "spoken-mqa_long_digit", + "spoken-mqa_single_step_reasoning", + "spoken-mqa_multi_step_reasoning", } + def test_spoken_mqa_uses_acc_metric_upstream(self): + """Upstream metric for spoken-mqa is ``acc`` (the canonical key + we expose externally is ``accuracy``). + """ + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + for t in AUDIOBENCH_TASKS: + if "spoken_mqa" in t.name: + assert t.metric == "accuracy" + assert t.upstream_metric == "acc" + def test_get_task_spec_returns_spec(self): from oellm.contrib.audiobench.task import get_task_spec @@ -148,6 +187,17 @@ def test_get_task_spec_unknown_raises(self): with pytest.raises(KeyError, match="Unknown AudioBench task"): get_task_spec("audiobench_does_not_exist") + def test_no_task_spec_carries_data_dir_attribute(self): + """``data_dir`` was removed once we discovered AudioBench has no + such flag; guard against accidental reintroduction. + """ + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS, AudioBenchTaskSpec + + # Field removed entirely from the dataclass. + assert "data_dir" not in AudioBenchTaskSpec.__dataclass_fields__ + for t in AUDIOBENCH_TASKS: + assert not hasattr(t, "data_dir") + # --------------------------------------------------------------------------- # Adapter — adapter.py @@ -155,6 +205,13 @@ def test_get_task_spec_unknown_raises(self): class TestAudioBenchModelAdapter: + """Adapter must return AudioBench's literal ``model_name`` dispatch keys. + + Each pattern check is a regression target — AudioBench's ``model.py`` + does ``if self.model_name == "":`` and raises + NotImplementedError on any other value. 
+ """ + @pytest.fixture def adapter_cls(self): from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter @@ -166,43 +223,61 @@ def test_is_base_model_adapter(self, adapter_cls): cls, base = adapter_cls assert issubclass(cls, base) - def test_qwen2_audio(self, adapter_cls): + def test_qwen2_audio_7b_instruct_returns_literal_key(self, adapter_cls): cls, _ = adapter_cls - assert cls("Qwen/Qwen2-Audio-7B-Instruct").to_contrib_flags() == "qwen2_audio" + # AudioBench dispatches on the literal "Qwen2-Audio-7B-Instruct". + assert ( + cls("Qwen/Qwen2-Audio-7B-Instruct").to_contrib_flags() + == "Qwen2-Audio-7B-Instruct" + ) - def test_qwen_audio(self, adapter_cls): + def test_qwen_audio_chat_returns_literal_key(self, adapter_cls): cls, _ = adapter_cls - assert cls("Qwen/Qwen-Audio-Chat").to_contrib_flags() == "qwen2_audio" + assert cls("Qwen/Qwen-Audio-Chat").to_contrib_flags() == "Qwen-Audio-Chat" - def test_salmonn(self, adapter_cls): + def test_salmonn_returns_salmonn_7b(self, adapter_cls): cls, _ = adapter_cls - assert cls("tsinghua/SALMONN-13B").to_contrib_flags() == "salmonn" + # AudioBench only ships the 7B variant (model_src/salmonn_7b.py). + assert cls("tsinghua/SALMONN-7B").to_contrib_flags() == "SALMONN_7B" - def test_ltu(self, adapter_cls): + def test_whisper_large_v3(self, adapter_cls): cls, _ = adapter_cls - assert cls("MIT/ltu-as").to_contrib_flags() == "ltu" + assert cls("openai/whisper-large-v3").to_contrib_flags() == "whisper_large_v3" - def test_whisper(self, adapter_cls): + def test_whisper_large_v2(self, adapter_cls): cls, _ = adapter_cls - assert cls("openai/whisper-large-v3").to_contrib_flags() == "whisper" + assert cls("openai/whisper-large-v2").to_contrib_flags() == "whisper_large_v2" - def test_audio_flamingo(self, adapter_cls): + def test_meralion_returns_full_literal_key(self, adapter_cls): cls, _ = adapter_cls - assert cls("nvidia/audio-flamingo-2").to_contrib_flags() == "audioflamingo" + assert ( + cls("MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION").to_contrib_flags() + == "MERaLiON-AudioLLM-Whisper-SEA-LION" + ) - def test_meralion(self, adapter_cls): + def test_phi_4_multimodal(self, adapter_cls): cls, _ = adapter_cls - assert cls("Singapore-NLP/MERaLiON-7B").to_contrib_flags() == "meralion" + assert ( + cls("microsoft/Phi-4-multimodal-instruct").to_contrib_flags() + == "phi_4_multimodal_instruct" + ) - def test_unknown_defaults_to_generic(self, adapter_cls): + def test_unknown_returns_none(self, adapter_cls): + """AudioBench has no generic loader. Unmatched paths must return + ``None`` so :func:`suite.run` can raise a clear error rather than + falling through to a fictitious dispatch key. 
+ """ cls, _ = adapter_cls - assert cls("random/unknown-model").to_contrib_flags() == "generic" + assert cls("random/unknown-model").to_contrib_flags() is None def test_module_level_detect_function(self): from oellm.contrib.audiobench.adapter import detect_audiobench_model_type - assert detect_audiobench_model_type("Qwen/Qwen2-Audio-7B") == "qwen2_audio" - assert detect_audiobench_model_type("completely/unknown") == "generic" + assert ( + detect_audiobench_model_type("Qwen/Qwen2-Audio-7B-Instruct") + == "Qwen2-Audio-7B-Instruct" + ) + assert detect_audiobench_model_type("completely/unknown") is None # --------------------------------------------------------------------------- @@ -243,10 +318,13 @@ def test_all_groups_are_zero_shot(self, suite): assert group["suite"] == SUITE def test_detect_model_flags_qwen2_audio(self, suite): - assert suite.detect_model_flags("Qwen/Qwen2-Audio-7B-Instruct") == "qwen2_audio" + assert ( + suite.detect_model_flags("Qwen/Qwen2-Audio-7B-Instruct") + == "Qwen2-Audio-7B-Instruct" + ) - def test_detect_model_flags_unknown_defaults_to_generic(self, suite): - assert suite.detect_model_flags("some/unknown-model") == "generic" + def test_detect_model_flags_unknown_returns_none(self, suite): + assert suite.detect_model_flags("some/unknown-model") is None def test_parse_results_recognises_audiobench_json(self, suite): data = { @@ -304,14 +382,14 @@ def test_top_group_expands_to_expected_task_names(self): def test_asr_group_has_15_leaves(self): results = _expand_task_groups([ASR_GROUP]) - # 9 bucket-B ASR + 6 bucket-A dual ASR = 15. + # 9 new ASR + 6 dual ASR = 15. assert len(results) == 15 for r in results: assert r.suite == SUITE def test_st_group_has_6_leaves(self): results = _expand_task_groups([ST_GROUP]) - # 5 bucket-B ST + 1 bucket-A dual (en→zh) = 6. + # 5 new ST + 1 dual (en→zh) = 6. assert len(results) == 6 def test_reasoning_group_has_6_leaves(self): @@ -339,7 +417,6 @@ def test_dataset_specs_dedupe_shared_repos(self): def test_dataset_specs_contain_audiollms_repos(self): specs = _collect_dataset_specs([TOP_GROUP]) repo_ids = {s.repo_id for s in specs} - # Sanity-check a handful of expected entries. assert "AudioLLMs/librispeech_test_clean" in repo_ids assert "AudioLLMs/earnings21_test" in repo_ids assert "AudioLLMs/MMAU-mini" in repo_ids @@ -370,7 +447,6 @@ def test_task_groups_merged_into_registry(self): registry._discover.cache_clear() merged = registry.get_all_task_groups() assert TOP_GROUP in merged["task_groups"] - # task_metrics come through too. assert "audiobench_librispeech_test_clean" in merged["task_metrics"] @@ -380,7 +456,7 @@ def test_task_groups_merged_into_registry(self): class TestRunnerIntegration: - def test_resolve_suite_appends_model_flag(self): + def test_resolve_suite_appends_audiobench_dispatch_key(self): from oellm.constants import EvaluationJob from oellm.runner import EvalRunner @@ -392,9 +468,17 @@ def test_resolve_suite_appends_model_flag(self): eval_suite="audiobench", ) result = runner.resolve_suite(job) - assert result == "audiobench:qwen2_audio" - - def test_resolve_suite_generic_fallback(self): + # AudioBench's literal dispatch key (case-sensitive) must come + # through verbatim so dispatch.py / suite.run get the exact value + # AudioBench's ``Model`` class compares against. 
+ assert result == "audiobench:Qwen2-Audio-7B-Instruct" + + def test_resolve_suite_unknown_model_passes_through_bare(self): + """When the adapter returns ``None`` (no AudioBench-supported + loader for the model path), ``resolve_suite`` keeps the bare + suite name; :func:`suite.run` then raises a clear error at + dispatch time rather than fabricating a fake key. + """ from oellm.constants import EvaluationJob from oellm.runner import EvalRunner @@ -406,7 +490,7 @@ def test_resolve_suite_generic_fallback(self): eval_suite="audiobench", ) result = runner.resolve_suite(job) - assert result == "audiobench:generic" + assert result == "audiobench" # bare, no ``:flags`` suffix # --------------------------------------------------------------------------- @@ -415,8 +499,10 @@ def test_resolve_suite_generic_fallback(self): class TestRunHarness: - """Exercise suite.run() with a mocked subprocess, verifying the CLI - it would invoke and the output JSON it writes. + """Exercise suite.run() with a mocked subprocess, verifying both the + CLI it would invoke (matching AudioBench's actual ``main()`` signature) + and that we read the score file from AudioBench's hardcoded output + location. """ def _fake_audiobench_tree(self, tmp_path: Path) -> Path: @@ -426,16 +512,54 @@ def _fake_audiobench_tree(self, tmp_path: Path) -> Path: (ab_dir / "src" / "main_evaluate.py").write_text("# placeholder\n") return ab_dir + @staticmethod + def _score_file_path( + ab_dir: Path, model_name: str, dataset: str, metric: str + ) -> Path: + """Mirror suite._extract_metrics' path construction.""" + return ( + ab_dir / "log_for_all_models" / model_name / f"{dataset}_{metric}_score.json" + ) + + def _fake_run_writing_score( + self, ab_dir: Path, *, score_value: float, body_shape: str = "flat" + ): + """Build a fake_run that writes a score file at AudioBench's + hardcoded path, parameterized by the JSON shape we want to test. 
+ """ + + def fake_run(cmd, cwd, env, check): + model_name = cmd[cmd.index("--model_name") + 1] + dataset = cmd[cmd.index("--dataset_name") + 1] + metric = cmd[cmd.index("--metrics") + 1] + score_file = self._score_file_path(Path(cwd), model_name, dataset, metric) + score_file.parent.mkdir(parents=True, exist_ok=True) + if body_shape == "flat": + score_file.write_text(json.dumps({metric: score_value})) + elif body_shape == "nested": + score_file.write_text( + json.dumps({"metrics": {metric: {"score": score_value, "n": 100}}}) + ) + elif body_shape == "missing_metric": + score_file.write_text(json.dumps({"irrelevant": 1})) + elif body_shape == "no_file": + pass # deliberately don't write + else: + raise ValueError(f"unknown body_shape: {body_shape}") + return _FakeCompletedProcess(0) + + return fake_run + def test_run_missing_audiobench_dir_raises(self, tmp_path): from oellm.contrib.audiobench.suite import run with pytest.raises(RuntimeError, match="AUDIOBENCH_DIR must be set"): run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=tmp_path / "out.json", - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={}, # no AUDIOBENCH_DIR ) @@ -446,53 +570,74 @@ def test_run_missing_entrypoint_raises(self, tmp_path): bad_dir.mkdir() with pytest.raises(FileNotFoundError, match="AudioBench entry point"): run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=tmp_path / "out.json", - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(bad_dir)}, ) + def test_run_unmapped_model_raises(self, tmp_path): + """AudioBench has no generic loader. When ``model_flags`` is + ``None`` (adapter found no match), :func:`run` must fail loudly + rather than invoking AudioBench with a missing/empty model_name. + """ + from oellm.contrib.audiobench.suite import run + + ab_dir = self._fake_audiobench_tree(tmp_path) + with pytest.raises(RuntimeError, match="Could not map model_path"): + run( + model_path="random/unknown-model", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=tmp_path / "out.json", + model_flags=None, + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + def test_run_invokes_subprocess_with_expected_cli(self, tmp_path): from oellm.contrib.audiobench import suite ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - # Write a fake AudioBench result JSON into the run_dir that - # _extract_metrics will pick up. 
- run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "task_result.json").write_text(json.dumps({"wer": 0.063})) - return _FakeCompletedProcess(0) - with patch( - "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score(ab_dir, score_value=0.063), ) as mock_sp: suite.run( model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir), "LIMIT": "100"}, ) assert mock_sp.call_count == 1 cmd = mock_sp.call_args.args[0] assert cmd[:2] == ["python", "src/main_evaluate.py"] - assert "--dataset" in cmd - assert cmd[cmd.index("--dataset") + 1] == "librispeech_test_clean" - assert cmd[cmd.index("--model") + 1] == "qwen2_audio" - assert cmd[cmd.index("--model_name") + 1] == "Qwen/Qwen2-Audio-7B-Instruct" + + # AudioBench's actual main() signature: dataset_name / model_name + # / metrics / overwrite / number_of_samples. No --model, no + # --log_dir, no --data_dir. + assert cmd[cmd.index("--dataset_name") + 1] == "librispeech_test_clean" + assert cmd[cmd.index("--model_name") + 1] == "Qwen2-Audio-7B-Instruct" assert cmd[cmd.index("--metrics") + 1] == "wer" - # LIMIT propagated. + assert cmd[cmd.index("--overwrite") + 1] == "True" assert cmd[cmd.index("--number_of_samples") + 1] == "100" - # cwd is AUDIOBENCH_DIR. + + # Flags AudioBench does NOT accept must not be in the cmd. + assert "--model" not in cmd # only --model_name exists upstream + assert "--log_dir" not in cmd # AudioBench writes to a fixed path + assert "--data_dir" not in cmd # split selection is via dataset_name + + # cwd is AUDIOBENCH_DIR so AudioBench's relative writes + # (log_for_all_models/...) land inside the clone. assert mock_sp.call_args.kwargs["cwd"] == str(ab_dir) - # Output JSON is lmms-eval-shaped and contains the extracted metric. + # Output JSON is lmms-eval-shaped. body = json.loads(output_path.read_text()) assert body["model_name_or_path"] == "Qwen/Qwen2-Audio-7B-Instruct" assert body["results"]["audiobench_librispeech_test_clean"][ @@ -500,32 +645,31 @@ def fake_run(cmd, cwd, env, check): ] == pytest.approx(0.063) assert body["configs"]["audiobench_librispeech_test_clean"]["num_fewshot"] == 0 - def test_run_forwards_data_dir_for_gigaspeech2(self, tmp_path): + def test_run_uses_per_split_dataset_name_for_gigaspeech2(self, tmp_path): + """GigaSpeech2 splits are dispatched via the dataset_name itself + (``gigaspeech2_thai``), not a ``--data_dir`` flag. 
+ """ from oellm.contrib.audiobench import suite ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "gs2.json").write_text(json.dumps({"wer": 0.12})) - return _FakeCompletedProcess(0) - with patch( - "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score(ab_dir, score_value=0.12), ) as mock_sp: suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_gigaspeech2_thai", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir)}, ) cmd = mock_sp.call_args.args[0] - assert "--data_dir" in cmd - assert cmd[cmd.index("--data_dir") + 1] == "th-test" + assert cmd[cmd.index("--dataset_name") + 1] == "gigaspeech2_thai" + assert "--data_dir" not in cmd def test_run_omits_number_of_samples_when_limit_empty(self, tmp_path): from oellm.contrib.audiobench import suite @@ -533,26 +677,48 @@ def test_run_omits_number_of_samples_when_limit_empty(self, tmp_path): ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "r.json").write_text(json.dumps({"wer": 0.1})) - return _FakeCompletedProcess(0) - with patch( - "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score(ab_dir, score_value=0.1), ) as mock_sp: suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir), "LIMIT": ""}, ) cmd = mock_sp.call_args.args[0] assert "--number_of_samples" not in cmd + def test_run_always_passes_overwrite_true(self, tmp_path): + """AudioBench skips evaluation when a stale score file already + exists unless ``--overwrite True`` is passed; we always pass it + because we do our own deduplication via output_path. 
+ """ + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score(ab_dir, score_value=0.1), + ) as mock_sp: + suite.run( + model_path="Qwen/Qwen2-Audio-7B-Instruct", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="Qwen2-Audio-7B-Instruct", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + cmd = mock_sp.call_args.args[0] + assert cmd[cmd.index("--overwrite") + 1] == "True" + def test_run_nonzero_exit_raises(self, tmp_path): from oellm.contrib.audiobench import suite @@ -565,38 +731,36 @@ def test_run_nonzero_exit_raises(self, tmp_path): ): with pytest.raises(RuntimeError, match="AudioBench exited with code 1"): suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir)}, ) def test_run_handles_nested_metric_json(self, tmp_path): - """AudioBench output format has drifted; support - ``{"metrics": {"wer": {"score": 0.05}}}`` as well as flat - ``{"wer": 0.05}``. + """AudioBench's score-file shape has drifted across releases; we + tolerate both ``{"wer": 0.05}`` and + ``{"metrics": {"wer": {"score": 0.05}}}`` layouts. """ from oellm.contrib.audiobench import suite ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "nested.json").write_text( - json.dumps({"metrics": {"wer": {"score": 0.051, "notes": "nested"}}}) - ) - return _FakeCompletedProcess(0) - - with patch("oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run): + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score( + ab_dir, score_value=0.051, body_shape="nested" + ), + ): suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir)}, ) @@ -605,25 +769,53 @@ def fake_run(cmd, cwd, env, check): "wer" ] == pytest.approx(0.051) - def test_run_missing_metric_in_output_raises(self, tmp_path): + def test_run_missing_score_file_raises(self, tmp_path): + """If AudioBench exits 0 but doesn't write the score file at the + expected path, surface a clear error rather than producing an + empty CSV row downstream. 
+ """ from oellm.contrib.audiobench import suite ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "no_metric.json").write_text(json.dumps({"irrelevant": 1})) - return _FakeCompletedProcess(0) + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score( + ab_dir, score_value=0.0, body_shape="no_file" + ), + ): + with pytest.raises( + RuntimeError, match="AudioBench did not write expected score file" + ): + suite.run( + model_path="Qwen/Qwen2-Audio-7B-Instruct", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="Qwen2-Audio-7B-Instruct", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + def test_run_score_file_without_metric_key_raises(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" - with patch("oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run): + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score( + ab_dir, score_value=0.0, body_shape="missing_metric" + ), + ): with pytest.raises(RuntimeError, match="Could not locate metric"): suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir)}, ) @@ -662,11 +854,40 @@ def test_dry_run_writes_audiobench_suite_to_csv(self, tmp_path): csv_files = list(tmp_path.glob("**/jobs.csv")) assert len(csv_files) == 1 df = pd.read_csv(csv_files[0]) - # All rows route to audiobench (with or without model-flag suffix). + # All rows route to audiobench (with model-flag suffix). assert all(s.startswith("audiobench") for s in df["eval_suite"].unique()) # task_path column contains canonical audiobench_ names. assert all(t.startswith("audiobench_") for t in df["task_path"].unique()) + def test_dry_run_preserves_model_flag_capitalization(self, tmp_path): + """Regression: scheduler.py used to lowercase the entire eval_suite + column, breaking AudioBench's case-sensitive dispatch keys + (``Qwen2-Audio-7B-Instruct`` was being mangled to + ``qwen2-audio-7b-instruct``). + """ + import pandas as pd + + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="Qwen/Qwen2-Audio-7B-Instruct", + task_groups=ASR_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + csv_files = list(tmp_path.glob("**/jobs.csv")) + df = pd.read_csv(csv_files[0]) + suites = set(df["eval_suite"].unique()) + # The exact AudioBench dispatch literal must come through case-intact. + assert "audiobench:Qwen2-Audio-7B-Instruct" in suites + def test_dry_run_sbatch_contains_contrib_dispatch(self, tmp_path): from oellm.main import schedule_evals @@ -687,7 +908,7 @@ def test_dry_run_sbatch_contains_contrib_dispatch(self, tmp_path): assert len(sbatch_files) == 1 content = sbatch_files[0].read_text() assert "oellm.contrib.dispatch" in content - # LIMIT is now exported so contrib plugins can read it. + # LIMIT is exported so contrib plugins can read it. 
assert "export LIMIT=" in content From bce403bb398cd1a22b578a4f8e398c4becfacddd Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 4 May 2026 16:02:49 +0200 Subject: [PATCH 4/6] update readme --- docs/VENV.md | 24 +++- oellm/contrib/audiobench/README.md | 150 ++++++++++++++--------- oellm/contrib/regiondial_bench/README.md | 57 ++++----- 3 files changed, 142 insertions(+), 89 deletions(-) diff --git a/docs/VENV.md b/docs/VENV.md index 553500fc..da9f53bb 100644 --- a/docs/VENV.md +++ b/docs/VENV.md @@ -4,7 +4,29 @@ Instead of using pre-built containers, you can run evaluations with your own Python virtual environment by passing `--venv-path`. -## Setup +## Choosing your venv + +Most evaluations share **one general venv**. A handful of framework-level +suites have hard dependency conflicts and need their own venv: + +| Task group(s) | Engine | Venv | Setup | +|---|---|---|---| +| `open-sci-*`, `belebele_*_cf`, all text/multilingual tasks | `lm-eval-harness`, `lighteval` | **general** | [Setup](#setup-general-venv) | +| `image-*`, `video-*`, `audio-*` (modality-prefixed) | `lmms-eval` | **general** | [Setup](#setup-general-venv) | +| `dclm-core-22` | `lm-eval-harness` (pinned 0.4.9.2) | **dclm** | [DCLM-core-22](#dclm-core-22) | +| `reasoning` (GPQA/MATH500/AIME/MBPP/etc.) | `evalchemy` + forked lm-eval | **evalchemy** | [Evalchemy](#evalchemy-reasoning) | + +Custom contrib benchmarks bring their own dependency stacks and are +documented in `oellm/contrib//README.md`: + +| Task group(s) | Contrib | README | +|---|---|---| +| `audio-audiobench*` | `audiobench` | [`oellm/contrib/audiobench/README.md`](../oellm/contrib/audiobench/README.md) | +| `regiondial-*` | `regiondial_bench` | [`oellm/contrib/regiondial_bench/README.md`](../oellm/contrib/regiondial_bench/README.md) | + +Use `oellm list-tasks` to see which suite a given task group routes to. + +## Setup (general venv) 1. Create a venv with Python 3.12: ```bash diff --git a/oellm/contrib/audiobench/README.md b/oellm/contrib/audiobench/README.md index 4306901a..ea4c49ab 100644 --- a/oellm/contrib/audiobench/README.md +++ b/oellm/contrib/audiobench/README.md @@ -34,38 +34,22 @@ and depend on a vLLM judge service being provisioned on Leonardo. ## Prerequisites -### 1. Clone AudioBench on the cluster +AudioBench is not pip-installable (no upstream build backend, bare imports +in `src/main_evaluate.py`); the plugin invokes it as a subprocess from an +on-cluster clone. A dedicated venv is required: the `[audiobench]` extra +pins `transformers<5` and `jiwer<3`, which conflict with the general eval +venv (see [`docs/VENV.md`](../../../docs/VENV.md) for the framework venvs). -AudioBench is **not** pip-installable — upstream is a script harness with -bare imports (`from dataset import ...` inside `src/main_evaluate.py`) and -no `pyproject.toml` / `setup.py`. The plugin invokes it as a subprocess -from an on-cluster clone. +### 1. Clone AudioBench ```bash git clone https://github.com/AudioLLMs/AudioBench /path/to/AudioBench ``` -We track the **latest `main`** — no pinned SHA — so updates are a simple -`git pull` under `$AUDIOBENCH_DIR`. If a breaking upstream change lands, -file an issue and we'll introduce a pin. +AudioBench's `main` branch is tracked without a pinned SHA; updates are a +`git pull` under `$AUDIOBENCH_DIR`. -### 2. 
Install AudioBench's own runtime dependencies - -Still inside the clone: - -```bash -cd /path/to/AudioBench -python -m venv .venv && source .venv/bin/activate -pip install -r requirements.txt -``` - -AudioBench's deps (unpinned upstream): `transformers`, `vllm`, `datasets`, -`torchaudio`, `peft`, `autoawq`, `huggingface-hub`, `librosa`, `soundfile`, -`fire`, `evaluate`, `jiwer`, `more_itertools`. Use a **separate venv** -from the elliot-cli venv — AudioBench typically pulls in a bleeding-edge -`transformers` that will conflict with lmms-eval's pin. - -### 3. Configure `clusters.yaml` +### 2. Configure clusters.yaml Add `AUDIOBENCH_DIR` to your cluster block in `oellm/resources/clusters.yaml`: @@ -76,29 +60,67 @@ leonardo: AUDIOBENCH_DIR: "/leonardo/home/userexternal//AudioBench" ``` -The plugin fails fast at dispatch time (via -`oellm.contrib.dispatch`'s `CLUSTER_ENV_VARS` check) if the variable is -missing, so you'll get a clean error message instead of a crash deep -inside the subprocess. - -### 4. Install the elliot-cli `audiobench` extra +`oellm.contrib.dispatch`'s `CLUSTER_ENV_VARS` check raises a clear error +at dispatch time if the variable is missing. -On the submission / login node where you run `oellm schedule-evals`: +### 3. Create a venv and install the `[audiobench]` extra ```bash +uv venv --python 3.12 audiobench-venv +source audiobench-venv/bin/activate uv pip install -e ".[audiobench]" ``` -This installs our Python-side scorer deps (`jiwer`, `sacrebleu`, -`pythainlp`, `evaluate`) used for result post-processing — **not** -AudioBench itself. +The extra pins `transformers>=4.45,<5`, `jiwer<3`, `sacrebleu`, +`pythainlp`, `evaluate`, `soundfile`, `librosa`. -### 5. Dataset pre-download +### 4. Install AudioBench's runtime dependencies -No manual steps required. `schedule-evals` auto-downloads every -`AudioLLMs/*` HF repo referenced by the requested task group on the -login node via `huggingface_hub.snapshot_download(max_workers=2)` so the -compute nodes do not need internet access. +Filter `vllm` — it is only used by judge-dependent tasks (deferred): + +```bash +grep -v -i '^vllm' /path/to/AudioBench/requirements.txt > /tmp/ab-reqs.txt +uv pip install -r /tmp/ab-reqs.txt +``` + +### 5. Re-pin PyTorch for the cluster's CUDA driver + +PyPI's default torch wheels target a CUDA runtime newer than most HPC +drivers (Leonardo, JURECA report CUDA 12.2) and crash with +`NVIDIA driver too old`. Use the `cu121` index: + +```bash +uv pip install torch torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/cu121 +``` + +### 6. Reinstall rapidfuzz + +The pure-Python fallback raises `NotImplementedError` on +`Levenshtein.editops`, which jiwer's WER scoring calls. Force a fresh +install of the C extension: + +```bash +uv pip install --reinstall rapidfuzz +``` + +### 7. Verify + +```bash +python -c " +from transformers import Qwen2AudioForConditionalGeneration +from rapidfuzz.distance import Levenshtein +Levenshtein.editops('a', 'b') # must not raise NotImplementedError +print('audiobench venv OK') +" +``` + +### Dataset pre-download + +No manual steps required. `schedule-eval` pre-downloads every +`AudioLLMs/*` HF repo referenced by the requested task group on the login +node via `huggingface_hub.snapshot_download(max_workers=2)`, so compute +nodes do not need internet access. ## Running @@ -115,23 +137,23 @@ compute nodes do not need internet access. 
```bash # Full AudioBench suite on a Qwen2-Audio model: -oellm schedule-evals \ +oellm schedule-eval \ --models Qwen/Qwen2-Audio-7B-Instruct \ --task-groups audio-audiobench \ - --venv-path ~/elliot-venv + --venv-path audiobench-venv # ASR only: -oellm schedule-evals \ +oellm schedule-eval \ --models Qwen/Qwen2-Audio-7B-Instruct \ --task-groups audio-audiobench-asr \ - --venv-path ~/elliot-venv + --venv-path audiobench-venv # Smoke test with --limit: -oellm schedule-evals \ +oellm schedule-eval \ --models Qwen/Qwen2-Audio-7B-Instruct \ --task-groups audio-audiobench-asr \ --limit 100 \ - --venv-path ~/elliot-venv + --venv-path audiobench-venv ``` `--limit N` is forwarded to AudioBench's `--number_of_samples N`. When @@ -154,23 +176,31 @@ vs `lmms_eval`) — no silent averaging. ## Supported model adapters -| Model path pattern | AudioBench `--model` key | -|-------------------------------------|--------------------------| -| `*qwen2-audio*` / `*qwen-audio*` | `qwen2_audio` | -| `*salmonn*` | `salmonn` | -| `*ltu-*` / `*/ltu*` / `*ltu_as*` | `ltu` | -| `*whisper-*` / `*/whisper*` | `whisper` | -| `*audio-flamingo*` / `*audioflamingo*` | `audioflamingo` | -| `*meralion*` | `meralion` | -| (anything else) | `generic` (default HF pipeline) | - -To override detection explicitly, pass the key as a suffix in the suite -column: `audiobench:qwen2_audio`. The dispatcher in -`oellm/contrib/dispatch.py` already splits on `:`. +AudioBench dispatches on a fixed list of literal `model_name` strings +(see `$AUDIOBENCH_DIR/src/model.py`); each loader under `model_src/` +fetches its own HF repo. Arbitrary HF checkpoints are not supported — +only the variants below: + +| Model path substring (lowered) | AudioBench `model_name` (literal) | +|------------------------------------------------|-------------------------------------------| +| `qwen2-audio-7b-instruct` / `qwen2_audio_7b_instruct` | `Qwen2-Audio-7B-Instruct` | +| `qwen-audio-chat` / `qwen_audio_chat` | `Qwen-Audio-Chat` | +| `salmonn` | `SALMONN_7B` | +| `meralion-audiollm` / `meralion_audiollm` | `MERaLiON-AudioLLM-Whisper-SEA-LION` | +| `whisper-large-v3` / `whisper_large_v3` | `whisper_large_v3` | +| `whisper-large-v2` / `whisper_large_v2` | `whisper_large_v2` | +| `phi-4-multimodal` / `phi_4_multimodal` | `phi_4_multimodal_instruct` | +| `seallms-audio-7b` / `seallms_audio_7b` | `seallms_audio_7b` | +| `wavllm` | `WavLLM_fairseq` | +| (anything else) | error — no generic loader upstream | + +To override detection, pass the literal AudioBench key as a suffix: +`audiobench:Qwen2-Audio-7B-Instruct`. Case is preserved end-to-end +(AudioBench's match is case-sensitive). ## How results flow end-to-end -1. `schedule-evals` expands `audio-audiobench*` groups → 27 rows in +1. `schedule-eval` expands `audio-audiobench*` groups → 27 rows in `jobs.csv` with `eval_suite=audiobench` (plus an adapter suffix from `detect_model_flags`). 2. `_collect_dataset_specs` auto-derives `needs_snapshot_download=True` diff --git a/oellm/contrib/regiondial_bench/README.md b/oellm/contrib/regiondial_bench/README.md index b540d6ef..57f646ea 100644 --- a/oellm/contrib/regiondial_bench/README.md +++ b/oellm/contrib/regiondial_bench/README.md @@ -16,17 +16,17 @@ plus per-round breakdown (R1–R7) for gIoU and bbox_AP. ## Prerequisites -### 1. 
Clone RegionReasoner +The benchmark calls `test/evaluation/evaluation_multi_segmentation.py` and +the `test/vision_reasoner/` model wrapper from the RegionReasoner +repository as a subprocess, so the repo must be present on the cluster +filesystem. A dedicated venv is required for `flash-attn` (specific +pre-built wheel) and HEIF image support (`pi-heif`); see +[`docs/VENV.md`](../../../docs/VENV.md) for the framework venvs. -The benchmark relies on the inference script -`test/evaluation/evaluation_multi_segmentation.py` and the model wrapper -`test/vision_reasoner/` from the RegionReasoner repository. These are **not -packaged** — the platform calls them directly as a subprocess, so the repo -must be present on the cluster filesystem. +### 1. Clone RegionReasoner ```bash -git clone https://github.com/lmsdss/RegionReasoner \ - /path/to/RegionReasoner +git clone https://github.com/lmsdss/RegionReasoner /path/to/RegionReasoner ``` ### 2. Configure clusters.yaml @@ -38,40 +38,41 @@ my-cluster: ... HF_HOME: "/path/to/large/filesystem/huggingface" # must have ~30 GB free REGION_REASONER_DIR: "/path/to/RegionReasoner" - GPUS_PER_NODE: 4 # controls both SLURM --gres and shard count + GPUS_PER_NODE: 4 # controls SLURM --gres and shard count ``` -> **`HF_HOME`** must point to a filesystem with at least **30 GB** of free -> space. On CINECA Leonardo, use the work filesystem -> (`/leonardo_work//huggingface`), not the home filesystem (50 GB -> quota, fills up quickly). +> `HF_HOME` must point to a filesystem with at least 30 GB free. On +> CINECA Leonardo, use the work filesystem +> (`/leonardo_work//huggingface`), not the home filesystem +> (50 GB quota). -### 3. Install dependencies in your venv +### 3. Create a venv and install dependencies ```bash -# PyTorch — match the CUDA version available on your cluster -pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 +uv venv --python 3.12 regiondial-venv +source regiondial-venv/bin/activate +uv pip install -e . -# Matching torchvision -pip install torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121 +# PyTorch — match the cluster's CUDA driver (cu121 for driver supporting CUDA 12.2) +uv pip install torch==2.5.1 torchvision==0.20.1 \ + --index-url https://download.pytorch.org/whl/cu121 -# flash-attn pre-built wheel (no compilation needed) +# flash-attn pre-built wheel (Python 3.12 / CUDA 12.x / torch 2.5.1) wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl -pip install flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl +uv pip install flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl # HEIF image support -pip install pi-heif +uv pip install pi-heif ``` -> **flash-attn note:** The pre-built wheel above is for Python 3.12, CUDA 12.x, -> torch 2.5.1. If your configuration differs, find the matching wheel at -> https://github.com/Dao-AILab/flash-attention/releases +> If your Python / CUDA / torch combination differs, find the matching +> flash-attn wheel at +> . -### 4. 
What gets auto-downloaded +### What gets auto-downloaded -When you run `oellm schedule-eval`, the platform automatically pre-downloads -the following on the login node (before SLURM submission, so compute nodes do -not need internet access): +`oellm schedule-eval` pre-downloads the following on the login node so +compute nodes do not need internet access: | Asset | HF repo | Size | |---|---|---| From b355cd57983eb41447eff7fdc65e1557c3ed5eee Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 4 May 2026 23:10:31 +0200 Subject: [PATCH 5/6] fix comments --- oellm/contrib/audiobench/README.md | 89 +++++++----------------------- 1 file changed, 21 insertions(+), 68 deletions(-) diff --git a/oellm/contrib/audiobench/README.md b/oellm/contrib/audiobench/README.md index ea4c49ab..60ab3261 100644 --- a/oellm/contrib/audiobench/README.md +++ b/oellm/contrib/audiobench/README.md @@ -40,30 +40,22 @@ on-cluster clone. A dedicated venv is required: the `[audiobench]` extra pins `transformers<5` and `jiwer<3`, which conflict with the general eval venv (see [`docs/VENV.md`](../../../docs/VENV.md) for the framework venvs). -### 1. Clone AudioBench +### 1. Clone AudioBench and configure `clusters.yaml` ```bash git clone https://github.com/AudioLLMs/AudioBench /path/to/AudioBench ``` -AudioBench's `main` branch is tracked without a pinned SHA; updates are a -`git pull` under `$AUDIOBENCH_DIR`. - -### 2. Configure clusters.yaml - Add `AUDIOBENCH_DIR` to your cluster block in `oellm/resources/clusters.yaml`: ```yaml leonardo: ... - AUDIOBENCH_DIR: "/leonardo/home/userexternal//AudioBench" + AUDIOBENCH_DIR: "/path/to/AudioBench" ``` -`oellm.contrib.dispatch`'s `CLUSTER_ENV_VARS` check raises a clear error -at dispatch time if the variable is missing. - -### 3. Create a venv and install the `[audiobench]` extra +### 2. Create the venv ```bash uv venv --python 3.12 audiobench-venv @@ -71,49 +63,36 @@ source audiobench-venv/bin/activate uv pip install -e ".[audiobench]" ``` -The extra pins `transformers>=4.45,<5`, `jiwer<3`, `sacrebleu`, -`pythainlp`, `evaluate`, `soundfile`, `librosa`. - -### 4. Install AudioBench's runtime dependencies +The `[audiobench]` extra pins `transformers>=4.45,<5`, `jiwer<3`, +`sacrebleu`, `pythainlp`, `evaluate`, `soundfile`, `librosa`. -Filter `vllm` — it is only used by judge-dependent tasks (deferred): +### 3. Install AudioBench's runtime dependencies ```bash +# AudioBench's own requirements (filter vllm; only used by deferred judge tasks) grep -v -i '^vllm' /path/to/AudioBench/requirements.txt > /tmp/ab-reqs.txt uv pip install -r /tmp/ab-reqs.txt -``` - -### 5. Re-pin PyTorch for the cluster's CUDA driver -PyPI's default torch wheels target a CUDA runtime newer than most HPC -drivers (Leonardo, JURECA report CUDA 12.2) and crash with -`NVIDIA driver too old`. Use the `cu121` index: - -```bash +# PyTorch for cluster's CUDA driver — PyPI defaults target a newer runtime +# than most HPC drivers (Leonardo / JURECA report CUDA 12.2) and crash with +# `NVIDIA driver too old`. Use the cu121 index. uv pip install torch torchvision torchaudio \ --index-url https://download.pytorch.org/whl/cu121 -``` - -### 6. Reinstall rapidfuzz -The pure-Python fallback raises `NotImplementedError` on -`Levenshtein.editops`, which jiwer's WER scoring calls. Force a fresh -install of the C extension: - -```bash +# rapidfuzz C extension — without this, jiwer's WER scoring hits the +# pure-Python fallback and raises NotImplementedError on Levenshtein.editops. 
uv pip install --reinstall rapidfuzz ``` -### 7. Verify - -```bash -python -c " -from transformers import Qwen2AudioForConditionalGeneration -from rapidfuzz.distance import Levenshtein -Levenshtein.editops('a', 'b') # must not raise NotImplementedError -print('audiobench venv OK') -" -``` +> Verify the venv works: +> ```bash +> python -c " +> from transformers import Qwen2AudioForConditionalGeneration +> from rapidfuzz.distance import Levenshtein +> Levenshtein.editops('a', 'b') # must not raise +> print('audiobench venv OK') +> " +> ``` ### Dataset pre-download @@ -198,29 +177,3 @@ To override detection, pass the literal AudioBench key as a suffix: `audiobench:Qwen2-Audio-7B-Instruct`. Case is preserved end-to-end (AudioBench's match is case-sensitive). -## How results flow end-to-end - -1. `schedule-eval` expands `audio-audiobench*` groups → 27 rows in - `jobs.csv` with `eval_suite=audiobench` (plus an adapter suffix from - `detect_model_flags`). -2. `_collect_dataset_specs` auto-derives `needs_snapshot_download=True` - from the group-name prefix (`audio-*`) and snapshots every referenced - `AudioLLMs/*` repo to the shared HF cache. -3. `template.sbatch`'s `*)` catch-all invokes - `python -m oellm.contrib.dispatch --suite audiobench: …`. -4. `oellm.contrib.audiobench.suite.run()` subprocesses - `python src/main_evaluate.py …` inside `$AUDIOBENCH_DIR`, captures - the result JSON AudioBench writes under its `--log_dir`, extracts the - metric value, and writes a lmms-eval-compatible JSON at - `$output_path`. -5. `collect-results` reads it via `parse_results()` and the standard - `_resolve_metric` fallback chain — no special-casing in core code. - -## Open items - -- **Judge service hosting:** judge-dependent tasks need a Llama-3-70B-AWQ - judge on an OpenAI-compatible endpoint. Plan is a separate long-running - vLLM sbatch whose URL/model lands in `clusters.yaml` as - `AUDIOBENCH_JUDGE_URL` and `AUDIOBENCH_JUDGE_MODEL`. -- **MERaLiON / IMDA NSC tasks:** ~21 gated AudioBench tasks require - corpora not on public HF. Deferred until WP4 needs them. 
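
Reviewer note: the score-file handling that the tests in this series pin down can be summarised in a short sketch. This is not the shipped `suite.py`; the path layout, the flat and nested JSON shapes, and the error messages are taken from the test assertions (`_score_file_path`, `_fake_run_writing_score`, and the `pytest.raises` match strings), and everything else (the function name, signature) is illustrative only.

```python
# Illustrative sketch of the metric-extraction step the tests describe.
# Assumptions: path layout and JSON shapes come from the test helpers;
# extract_metric() itself is a hypothetical name, not the real suite API.
import json
from pathlib import Path


def extract_metric(audiobench_dir: Path, model_name: str, dataset: str, metric: str) -> float:
    """Read AudioBench's hardcoded score file and return the metric value."""
    score_file = (
        audiobench_dir / "log_for_all_models" / model_name / f"{dataset}_{metric}_score.json"
    )
    if not score_file.exists():
        # AudioBench exited 0 but never wrote its output: fail loudly.
        raise RuntimeError(f"AudioBench did not write expected score file: {score_file}")

    body = json.loads(score_file.read_text())

    # Flat layout: {"wer": 0.063}
    if isinstance(body.get(metric), (int, float)):
        return float(body[metric])

    # Nested layout: {"metrics": {"wer": {"score": 0.051, "n": 100}}}
    nested = body.get("metrics", {}).get(metric, {})
    if isinstance(nested, dict) and isinstance(nested.get("score"), (int, float)):
        return float(nested["score"])

    raise RuntimeError(f"Could not locate metric '{metric}' in {score_file}")
```
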
From 6117bd511582f3c381ff66084e8ba18fe9b829eb Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 4 May 2026 23:13:39 +0200 Subject: [PATCH 6/6] Revert accidental clusters.yaml changes --- oellm/resources/clusters.yaml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index 370201b6..36e23d29 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -1,5 +1,5 @@ shared: - TIME_LIMIT: "02:30:00" # time limit in the format HH:MM:SS + TIME_LIMIT: "00:30:00" # time limit in the format HH:MM:SS UV_LINK_MODE: "copy" EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 @@ -7,16 +7,13 @@ shared: HF_DATASETS_DISABLE_PROGRESS_BARS: "1" leonardo: - hostname_pattern: "*.leonardo.local" - EVAL_BASE_DIR: "/leonardo/home/userexternal/islobozh/oellm-cli-shared-evals/" - PARTITION: "boost_usr_prod" - ACCOUNT: "OELLM_prod2026" - QUEUE_LIMIT: 1000 - EVAL_CONTAINER_IMAGE: "eval_env-leonardo.sif" + hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML + EVAL_BASE_DIR: "/leonardo_work/AIFAC_L01_028/oellm-cli-shared-evals" + PARTITION: "boost_usr_prod" # default partition to use + ACCOUNT: "OELLM_prod2026" # default account to use + QUEUE_LIMIT: 1000 # maximum number of jobs that can be submitted as job/array, used to send only jobs that respects QOS + EVAL_CONTAINER_IMAGE: "eval_env-leonardo.sif" # name of the container image that is pulled which is built automatically with Github actions SINGULARITY_ARGS: "--nv" - HF_HOME: "/leonardo_work/OELLM_prod2026/huggingface" - GPUS_PER_NODE: 4 - REGION_REASONER_DIR: "/leonardo/home/userexternal/islobozh/RegionReasoner" jureca: hostname_pattern: "*.jureca"
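
Reviewer note: the adapter behaviour exercised in the tests (literal, case-sensitive AudioBench dispatch keys; `None` for unmatched model paths) amounts to a small substring lookup. The sketch below is not the shipped `adapter.py`; the substring-to-literal pairs are copied from the README table and the test expectations, and the function body is illustrative only.

```python
# Minimal sketch, assuming a plain substring match on the lowercased model
# path. The literals are AudioBench's dispatch keys as listed in the README;
# the dict layout and the function body are hypothetical.
AUDIOBENCH_DISPATCH_KEYS = {
    "qwen2-audio-7b-instruct": "Qwen2-Audio-7B-Instruct",
    "qwen2_audio_7b_instruct": "Qwen2-Audio-7B-Instruct",
    "qwen-audio-chat": "Qwen-Audio-Chat",
    "qwen_audio_chat": "Qwen-Audio-Chat",
    "salmonn": "SALMONN_7B",
    "meralion-audiollm": "MERaLiON-AudioLLM-Whisper-SEA-LION",
    "meralion_audiollm": "MERaLiON-AudioLLM-Whisper-SEA-LION",
    "whisper-large-v3": "whisper_large_v3",
    "whisper_large_v3": "whisper_large_v3",
    "whisper-large-v2": "whisper_large_v2",
    "whisper_large_v2": "whisper_large_v2",
    "phi-4-multimodal": "phi_4_multimodal_instruct",
    "phi_4_multimodal": "phi_4_multimodal_instruct",
    "seallms-audio-7b": "seallms_audio_7b",
    "seallms_audio_7b": "seallms_audio_7b",
    "wavllm": "WavLLM_fairseq",
}


def detect_audiobench_model_type(model_path: str) -> str | None:
    """Return AudioBench's case-sensitive model_name literal, or None if unmapped."""
    lowered = model_path.lower()
    for substring, literal in AUDIOBENCH_DISPATCH_KEYS.items():
        if substring in lowered:
            return literal
    # No generic loader exists upstream; the caller raises a clear error instead.
    return None
```

When a key is found, `resolve_suite` appends it verbatim to the suite name (for example `audiobench:Qwen2-Audio-7B-Instruct`, as the runner-integration test asserts); when the lookup returns `None`, the bare `audiobench` suite name is kept and `suite.run` raises at dispatch time.
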