From 6954ae99575499b389b173e13525ef13400d4d7f Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Tue, 21 Apr 2026 11:24:02 +0200 Subject: [PATCH 1/6] adding audiobench benchmark as contrib --- oellm/contrib/audiobench/README.md | 199 +++++++++++++ oellm/contrib/audiobench/__init__.py | 0 oellm/contrib/audiobench/adapter.py | 80 +++++ oellm/contrib/audiobench/suite.py | 417 +++++++++++++++++++++++++++ oellm/contrib/audiobench/task.py | 293 +++++++++++++++++++ oellm/resources/template.sbatch | 5 +- pyproject.toml | 14 + 7 files changed, 1007 insertions(+), 1 deletion(-) create mode 100644 oellm/contrib/audiobench/README.md create mode 100644 oellm/contrib/audiobench/__init__.py create mode 100644 oellm/contrib/audiobench/adapter.py create mode 100644 oellm/contrib/audiobench/suite.py create mode 100644 oellm/contrib/audiobench/task.py diff --git a/oellm/contrib/audiobench/README.md b/oellm/contrib/audiobench/README.md new file mode 100644 index 00000000..b1eee38a --- /dev/null +++ b/oellm/contrib/audiobench/README.md @@ -0,0 +1,199 @@ +# AudioBench + +AudioBench (AudioLLMs/AudioBench, [arXiv 2406.16020](https://arxiv.org/abs/2406.16020)) +is a broad audio-understanding benchmark covering ASR, speech translation, +spoken reasoning, audio scene QA, and paralinguistics. This contrib plugin +wraps AudioBench as a callable `audiobench` suite inside elliot-cli so WP4 +can produce numbers directly comparable with the AudioBench paper and +leaderboard, without the scoring-normalisation drift that would come from +running the same datasets through lmms-eval. + +## Scope — Phase 1 (this release) + +**27 judge-free tasks** across ASR (WER), speech translation (BLEU), spoken +reasoning (accuracy / string_match), and AudioCaps (METEOR). Of these: + +- **20 tasks are genuinely new** to the platform — not in any of our + existing lmms-eval `audio-*` groups. Examples: `earnings21_test`, + `earnings22_test`, GigaSpeech2 (Thai / Indonesian / Vietnamese), + SEAME code-switch, Spoken-MQA reasoning splits, MMAU mini. +- **7 tasks are dual-registered** duplicates of benchmarks we already run + through lmms-eval (LibriSpeech test-clean/other, Common Voice 15 EN, + GigaSpeech, People's Speech, TED-LIUM 3, CoVoST2 en→zh). These use + AudioBench's own scorer and normaliser so WP4 can report numbers + aligned with the AudioBench paper. + +Every AudioBench task is namespaced with an `audiobench_` prefix so the CSV +`task_path` column unambiguously identifies which scorer produced a number +(e.g. `audiobench_librispeech_test_clean` is AudioBench-scored; +`librispeech_test_clean` remains the lmms-eval version). + +**Phase 2** (not in this release) will add ~19 judge-dependent tasks +(SLUE-SQA5, Spoken-SQuAD, AudioCaps-QA, IEMOCAP / MELD / VoxCeleb probes, +AudioLLM-InstructionFollowing) once a vLLM judge server is provisioned on +Leonardo. + +## Prerequisites + +### 1. Clone AudioBench on the cluster + +AudioBench is **not** pip-installable — upstream is a script harness with +bare imports (`from dataset import ...` inside `src/main_evaluate.py`) and +no `pyproject.toml` / `setup.py`. The plugin invokes it as a subprocess +from an on-cluster clone. + +```bash +git clone https://github.com/AudioLLMs/AudioBench /path/to/AudioBench +``` + +We track the **latest `main`** — no pinned SHA — so updates are a simple +`git pull` under `$AUDIOBENCH_DIR`. If a breaking upstream change lands, +file an issue and we'll introduce a pin. + +### 2. 
Install AudioBench's own runtime dependencies + +Still inside the clone: + +```bash +cd /path/to/AudioBench +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +``` + +AudioBench's deps (unpinned upstream): `transformers`, `vllm`, `datasets`, +`torchaudio`, `peft`, `autoawq`, `huggingface-hub`, `librosa`, `soundfile`, +`fire`, `evaluate`, `jiwer`, `more_itertools`. Use a **separate venv** +from the elliot-cli venv — AudioBench typically pulls in a bleeding-edge +`transformers` that will conflict with lmms-eval's pin. + +### 3. Configure `clusters.yaml` + +Add `AUDIOBENCH_DIR` to your cluster block in +`oellm/resources/clusters.yaml`: + +```yaml +leonardo: + ... + AUDIOBENCH_DIR: "/leonardo/home/userexternal//AudioBench" +``` + +The plugin fails fast at dispatch time (via +`oellm.contrib.dispatch`'s `CLUSTER_ENV_VARS` check) if the variable is +missing, so you'll get a clean error message instead of a crash deep +inside the subprocess. + +### 4. Install the elliot-cli `audiobench` extra + +On the submission / login node where you run `oellm schedule-evals`: + +```bash +uv pip install -e ".[audiobench]" +``` + +This installs our Python-side scorer deps (`jiwer`, `sacrebleu`, +`pythainlp`, `evaluate`) used for result post-processing — **not** +AudioBench itself. + +### 5. Dataset pre-download + +No manual steps required. `schedule-evals` auto-downloads every +`AudioLLMs/*` HF repo referenced by the requested task group on the +login node via `huggingface_hub.snapshot_download(max_workers=2)` so the +compute nodes do not need internet access. The rate-limit-friendly +`max_workers=2` is shared infrastructure — see `oellm/utils.py`. + +## Running + +### Available task groups + +| Task group | Leaves | What it covers | +|----------------------------------|--------|-----------------------------------------------------------------| +| `audio-audiobench` | 27 | Full Phase-1 suite (everything below). | +| `audio-audiobench-asr` | 15 | WER tasks — 9 new + 6 dual-registered with lmms-eval. | +| `audio-audiobench-st` | 6 | BLEU speech-translation — 5 new + 1 dual (en→zh). | +| `audio-audiobench-reasoning` | 6 | Spoken-MQA × 4, MMAU mini, AudioCaps METEOR. | + +### Example + +```bash +# Full AudioBench Phase-1 suite on a Qwen2-Audio model: +oellm schedule-evals \ + --models Qwen/Qwen2-Audio-7B-Instruct \ + --task-groups audio-audiobench \ + --venv-path ~/elliot-venv + +# ASR only: +oellm schedule-evals \ + --models Qwen/Qwen2-Audio-7B-Instruct \ + --task-groups audio-audiobench-asr \ + --venv-path ~/elliot-venv + +# Smoke test with --limit: +oellm schedule-evals \ + --models Qwen/Qwen2-Audio-7B-Instruct \ + --task-groups audio-audiobench-asr \ + --limit 100 \ + --venv-path ~/elliot-venv +``` + +`--limit N` is forwarded to AudioBench's `--number_of_samples N`. When +unset, the full test split is evaluated. + +### Collecting results + +```bash +oellm collect-results \ + --eval-output-dir /path/to/evals \ + --output-csv audiobench_results.csv +``` + +The primary metric per task is what's registered in `task_metrics` +(`wer` / `bleu` / `accuracy` / `string_match` / `meteor`). Dual-registered +tasks land in the CSV **alongside** their lmms-eval counterparts, with +different `task_path` values (`audiobench_librispeech_test_clean` vs +`librispeech_test_clean`) and different `eval_suite` values (`audiobench` +vs `lmms_eval`) — no silent averaging. 
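For example, a collection run that covers both suites produces two independent
rows for LibriSpeech test-clean (illustrative excerpt — only the distinguishing
columns are shown):

| `task_path`                          | `eval_suite` | Primary metric                          |
|--------------------------------------|--------------|-----------------------------------------|
| `audiobench_librispeech_test_clean`  | `audiobench` | `wer` (AudioBench scorer + normaliser)  |
| `librispeech_test_clean`             | `lmms_eval`  | `wer` (lmms-eval scorer)                |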
+ +## Supported model adapters + +| Model path pattern | AudioBench `--model` key | +|-------------------------------------|--------------------------| +| `*qwen2-audio*` / `*qwen-audio*` | `qwen2_audio` | +| `*salmonn*` | `salmonn` | +| `*ltu-*` / `*/ltu*` / `*ltu_as*` | `ltu` | +| `*whisper-*` / `*/whisper*` | `whisper` | +| `*audio-flamingo*` / `*audioflamingo*` | `audioflamingo` | +| `*meralion*` | `meralion` | +| (anything else) | `generic` (default HF pipeline) | + +To override detection explicitly, pass the key as a suffix in the suite +column: `audiobench:qwen2_audio`. The dispatcher in +`oellm/contrib/dispatch.py` already splits on `:`. + +## How results flow end-to-end + +1. `schedule-evals` expands `audio-audiobench*` groups → 27 rows in + `jobs.csv` with `eval_suite=audiobench` (plus an adapter suffix from + `detect_model_flags`). +2. `_collect_dataset_specs` auto-derives `needs_snapshot_download=True` + from the group-name prefix (`audio-*`) and snapshots every referenced + `AudioLLMs/*` repo to the shared HF cache. +3. `template.sbatch`'s `*)` catch-all invokes + `python -m oellm.contrib.dispatch --suite audiobench: …`. +4. `oellm.contrib.audiobench.suite.run()` subprocesses + `python src/main_evaluate.py …` inside `$AUDIOBENCH_DIR`, captures + the result JSON AudioBench writes under its `--log_dir`, extracts the + metric value, and writes a lmms-eval-compatible JSON at + `$output_path`. +5. `collect-results` reads it via `parse_results()` and the standard + `_resolve_metric` fallback chain — no special-casing in core code. + +## Open questions / Phase-2 prerequisites + +- **Judge service hosting:** Phase 2 needs a Llama-3-70B-AWQ judge on an + OpenAI-compatible endpoint. Plan is a separate long-running vLLM sbatch + whose URL/model lands in `clusters.yaml` as `AUDIOBENCH_JUDGE_URL` and + `AUDIOBENCH_JUDGE_MODEL`. +- **MERaLiON / IMDA NSC tasks:** ~21 gated AudioBench tasks require + corpora not on public HF. These will ship in a later phase — or not, + depending on whether WP4 needs them. diff --git a/oellm/contrib/audiobench/__init__.py b/oellm/contrib/audiobench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/oellm/contrib/audiobench/adapter.py b/oellm/contrib/audiobench/adapter.py new file mode 100644 index 00000000..2b15c7f6 --- /dev/null +++ b/oellm/contrib/audiobench/adapter.py @@ -0,0 +1,80 @@ +"""AudioBench model adapter. + +Maps a HuggingFace model path (or local filesystem path) to the string key +that AudioBench's ``src/main_evaluate.py --model`` argument expects. The +upstream dispatch table lives in ``AudioBench/src/model.py`` and is +hand-wired — one entry per model family. + +The adapter returns one of: + +- ``"qwen2_audio"`` — Qwen2-Audio / Qwen-Audio checkpoints. +- ``"salmonn"`` — SALMONN family (Tsinghua). +- ``"ltu"`` — Listen-Think-Understand. +- ``"whisper"`` — Whisper (OpenAI). +- ``"audioflamingo"`` — Audio-Flamingo (NVIDIA). +- ``"meralion"`` — MERaLiON (Singapore-NLP). +- ``"generic"`` — fallback. AudioBench treats this as the default HF + pipeline dispatch, which works for many generic audio + LLMs but may need tuning per model. + +The detected value is passed to :mod:`oellm.contrib.dispatch` as the +``model_flags`` portion of the ``eval_suite`` column +(``audiobench:``), exactly like the regiondial_bench pattern. +""" + +from __future__ import annotations + +from oellm.core.base_model_adapter import BaseModelAdapter + +# (model-family key, substrings to match in lowered model path) +# Order matters — first match wins. 
More-specific patterns must appear +# before their super-strings (e.g. "qwen2-audio" before "qwen"). +_PATTERNS: list[tuple[str, tuple[str, ...]]] = [ + ("qwen2_audio", ("qwen2-audio", "qwen2_audio", "qwen-audio", "qwen_audio")), + ("salmonn", ("salmonn",)), + # LTU checkpoints often have paths like "ltu-as/", "ltu-7b", or + # "MIT/ltu". Prefix with "/" / "-" where possible to avoid false + # matches (e.g. "altus"). + ("ltu", ("ltu-", "/ltu", "_ltu", "ltu_as")), + ("whisper", ("whisper-", "/whisper", "openai/whisper")), + ("audioflamingo", ("audio-flamingo", "audioflamingo", "audio_flamingo")), + ("meralion", ("meralion",)), +] + + +class AudioBenchModelAdapter(BaseModelAdapter): + """Adapter that resolves ``--model`` flag for AudioBench subprocess.""" + + def __init__(self, model_path: str) -> None: + self._path = model_path + + @property + def model_path(self) -> str: + return self._path + + def to_lm_eval_args(self) -> str: + # Not used — AudioBench doesn't route through lm-eval. Provided + # only to satisfy the BaseModelAdapter contract. + return f"pretrained={self._path},trust_remote_code=True" + + def to_lmms_eval_args(self) -> str: + # Not used — see note on to_lm_eval_args(). + return f"pretrained={self._path}" + + def to_contrib_flags(self) -> str | None: + """Return the AudioBench ``--model`` key for this model path.""" + lowered = self._path.lower() + for key, needles in _PATTERNS: + if any(n in lowered for n in needles): + return key + return "generic" + + +def detect_audiobench_model_type(model_path: str) -> str: + """Module-level convenience — matches :func:`oellm.constants.detect_lmms_model_type`. + + Returns the same value as + ``AudioBenchModelAdapter(model_path).to_contrib_flags()`` but never + returns ``None`` (falls back to ``"generic"``). + """ + return AudioBenchModelAdapter(model_path).to_contrib_flags() or "generic" diff --git a/oellm/contrib/audiobench/suite.py b/oellm/contrib/audiobench/suite.py new file mode 100644 index 00000000..7fed48a1 --- /dev/null +++ b/oellm/contrib/audiobench/suite.py @@ -0,0 +1,417 @@ +"""AudioBench contrib suite — plugin protocol implementation. + +Implements the :mod:`oellm.registry` plugin protocol for the AudioBench +benchmark (AudioLLMs/AudioBench, arXiv 2406.16020). AudioBench is **not** a +pip-installable library — it is a script harness. We invoke its entry point +via ``python src/main_evaluate.py`` as a subprocess, from a clone pointed at +by the ``$AUDIOBENCH_DIR`` environment variable (configured in +``clusters.yaml``). This mirrors the precedent set by ``regiondial_bench``. + +Cluster setup +------------- +The following environment variables must be set in ``clusters.yaml`` (or the +cluster's module/profile system) before using any ``audio-audiobench-*`` +task group: + +``AUDIOBENCH_DIR`` + Absolute path to a local clone of + https://github.com/AudioLLMs/AudioBench. The entry point + ``src/main_evaluate.py`` must be present and the repo's own Python + dependencies must be installed in the active environment. + +Phase 2 (judge-dependent tasks) will additionally require: + +``AUDIOBENCH_JUDGE_URL`` / ``AUDIOBENCH_JUDGE_MODEL`` + OpenAI-compatible URL and model name for the judge server (typically a + vLLM deployment of ``meta-llama/Meta-Llama-3-70B-Instruct-AWQ``). Not + needed for Phase-1 judge-free tasks shipped today. 
+ +Output format +------------- +:func:`run` writes a lmms-eval-compatible JSON file to *output_path* so +that :func:`oellm.main.collect_results` can parse it without modification:: + + { + "model_name_or_path": "", + "results": { + "audiobench_librispeech_test_clean": { + "wer": 0.047 + } + }, + "configs": { + "audiobench_librispeech_test_clean": {"num_fewshot": 0} + } + } +""" + +from __future__ import annotations + +import json +import logging +import os +import subprocess +from pathlib import Path + +from oellm.contrib.audiobench.task import ( + AUDIOBENCH_TASKS, + SUITE_NAME, + AudioBenchTaskSpec, + get_task_spec, +) + +logger = logging.getLogger(__name__) + +CLUSTER_ENV_VARS = ["AUDIOBENCH_DIR"] + +# Mapping family → (group_name, human description). +_FAMILY_GROUPS = { + "asr": ( + "audio-audiobench-asr", + "AudioBench ASR tasks (WER). Covers AudioBench-scored LibriSpeech, " + "Common Voice 15 EN, GigaSpeech, People's Speech, TED-LIUM 3 — dual " + "with our lmms-eval versions for paper-comparable numbers — plus new " + "tasks not in lmms-eval: earnings21/22, TED-LIUM 3 long-form, " + "AISHELL Mandarin, GigaSpeech2 (th/id/vi), SEAME code-switch.", + ), + "st": ( + "audio-audiobench-st", + "AudioBench speech-translation tasks (BLEU). CoVoST2 covering " + "en↔id, en↔ta, zh→en, ta→en (new), plus dual-registered en→zh.", + ), + "reasoning": ( + "audio-audiobench-reasoning", + "AudioBench spoken reasoning / captioning. Spoken-MQA digit + " + "reasoning splits (accuracy), MMAU-mini (string_match), " + "AudioCaps (METEOR).", + ), +} + +_TOP_LEVEL_GROUP = "audio-audiobench" +_TOP_LEVEL_DESC = ( + "AudioBench Phase-1 suite (judge-free). Runs all 27 AudioBench tasks " + "that do not require an LLM judge: ASR (WER), speech translation (BLEU), " + "spoken reasoning (accuracy/string_match), and AudioCaps captioning " + "(METEOR). Phase 2 (judge-dependent tasks) will extend this group once " + "the judge service is configured." +) + + +def _build_task_groups() -> dict: + """Assemble the :data:`TASK_GROUPS` dict from :data:`AUDIOBENCH_TASKS`. + + One top-level ``audio-audiobench`` group containing all 27 leaves, plus + three sub-groups keyed by family (``-asr`` / ``-st`` / ``-reasoning``). + All groups are zero-shot by design — AudioBench tasks do not support + in-context examples. + """ + task_metrics: dict[str, str] = {t.name: t.metric for t in AUDIOBENCH_TASKS} + + def _task_entry(t: AudioBenchTaskSpec) -> dict: + entry: dict = {"task": t.name, "dataset": t.hf_repo} + # ``data_dir``-style subsetting: we deliberately do NOT set ``subset`` + # in the YAML entry. The reason is that ``load_dataset(name=...)`` + # used by ``_pre_download_datasets_from_specs`` treats ``subset`` as a + # config name, not a ``data_dir`` — and for gigaspeech2/spoken-mqa the + # upstream distinction is a data_dir, not a config. Since the group + # name starts with "audio-", ``_collect_dataset_specs`` auto-sets + # ``needs_snapshot_download=True`` which downloads the whole repo, + # so AudioBench can read the right data_dir at runtime. This also + # means multiple tasks sharing one HF repo dedupe to a single spec. + return entry + + groups: dict[str, dict] = {} + + # Sub-groups per family. 
+ tasks_by_family: dict[str, list[AudioBenchTaskSpec]] = { + "asr": [], + "st": [], + "reasoning": [], + } + for t in AUDIOBENCH_TASKS: + tasks_by_family[t.family].append(t) + + for family, (group_name, desc) in _FAMILY_GROUPS.items(): + entries = tasks_by_family[family] + if not entries: + continue + groups[group_name] = { + "suite": SUITE_NAME, + "n_shots": [0], + "description": desc, + "tasks": [_task_entry(t) for t in entries], + } + + # Top-level group — union of everything. + groups[_TOP_LEVEL_GROUP] = { + "suite": SUITE_NAME, + "n_shots": [0], + "description": _TOP_LEVEL_DESC, + "tasks": [_task_entry(t) for t in AUDIOBENCH_TASKS], + } + + return {"task_metrics": task_metrics, "task_groups": groups} + + +TASK_GROUPS: dict = _build_task_groups() + + +# --------------------------------------------------------------------------- +# Model-flag detection. +# --------------------------------------------------------------------------- + + +def detect_model_flags(model_path: str) -> str | None: + """Delegate to :class:`AudioBenchModelAdapter`. + + Called by :class:`oellm.runner.EvalRunner.resolve_suite` to append the + AudioBench model-family key to ``eval_suite`` as + ``audiobench:``. + """ + from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter + + return AudioBenchModelAdapter(model_path).to_contrib_flags() + + +# --------------------------------------------------------------------------- +# Runtime — subprocess into AudioBench's src/main_evaluate.py. +# --------------------------------------------------------------------------- + + +def run( + *, + model_path: str, + task: str, + n_shot: int, + output_path: Path, + model_flags: str | None, + env: dict[str, str], +) -> None: + """Execute one AudioBench task and write lmms-eval-shaped JSON. + + Args: + model_path: HF repo ID or local path of the model under evaluation. + task: Canonical task name (must start with ``audiobench_``). + n_shot: Always 0 for AudioBench — recorded in the output ``configs`` + block for downstream compatibility. + output_path: Destination for the lmms-eval-compatible result JSON. + model_flags: AudioBench ``--model`` key (e.g. ``"qwen2_audio"``); + produced by :func:`detect_model_flags`. Falls back to + ``"generic"`` if not supplied. + env: Environment dict passed to the subprocess. Must contain + ``AUDIOBENCH_DIR`` (validated by dispatch.py before ``run`` is + called, but we re-check for safety). + + Raises: + RuntimeError: if AudioBench returns non-zero or produces no output. + KeyError: if *task* is not in the registry. + """ + ab_dir = env.get("AUDIOBENCH_DIR") + if not ab_dir: + raise RuntimeError( + "AUDIOBENCH_DIR must be set. Add it to clusters.yaml — " + "it should point at a local clone of " + "https://github.com/AudioLLMs/AudioBench." + ) + + entrypoint = Path(ab_dir) / "src" / "main_evaluate.py" + if not entrypoint.exists(): + raise FileNotFoundError( + f"AudioBench entry point not found: {entrypoint}\n" + f"Check that AUDIOBENCH_DIR={ab_dir!r} points at a valid " + "AudioBench clone." + ) + + spec = get_task_spec(task) + model_key = model_flags or "generic" + + # AudioBench writes outputs under a run-specific log directory; we set + # it to our output_path's parent so we can recover the raw result. 
+ run_dir = output_path.parent / f"audiobench_{output_path.stem}" + run_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + "python", + "src/main_evaluate.py", + "--dataset", + spec.upstream_name, + "--model", + model_key, + "--model_name", + model_path, + "--metrics", + spec.upstream_metric, + "--log_dir", + str(run_dir), + ] + if spec.data_dir: + cmd.extend(["--data_dir", spec.data_dir]) + + # Forward LIMIT (set by template.sbatch) as AudioBench's + # --number_of_samples when present. "-1" means no limit in AudioBench. + limit = env.get("LIMIT", "").strip() + if limit: + cmd.extend(["--number_of_samples", str(limit)]) + + logger.info("AudioBench cmd: %s (cwd=%s)", " ".join(cmd), ab_dir) + completed = subprocess.run( + cmd, + cwd=ab_dir, + env=env, + check=False, + ) + if completed.returncode != 0: + raise RuntimeError( + f"AudioBench exited with code {completed.returncode} for " + f"task={task!r} model={model_path!r}" + ) + + metrics = _extract_metrics(run_dir, spec) + _write_lmms_shaped_json( + output_path=output_path, + model_path=model_path, + task_name=task, + n_shot=n_shot, + metrics=metrics, + ) + logger.info("Results written to %s", output_path) + + +def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float]: + """Find AudioBench's per-task score JSON inside *run_dir* and read it. + + AudioBench writes one JSON file per task under its ``--log_dir`` with + the score under a key matching ``--metrics``. We search recursively + for any ``*.json`` and pick the first one whose body contains the + expected metric key. This is intentionally lenient because upstream + log-layout has changed across releases. + + Raises: + RuntimeError: if no matching result file is found. + """ + candidates = sorted(run_dir.rglob("*.json")) + if not candidates: + raise RuntimeError( + f"AudioBench produced no result JSON under {run_dir}. " + "Check stdout/stderr for crashes." + ) + + target_key = spec.upstream_metric + for path in candidates: + try: + with open(path) as f: + body = json.load(f) + except (json.JSONDecodeError, OSError): + continue + value = _find_metric(body, target_key) + if value is not None: + # Emit the metric under OUR canonical key (spec.metric) so the + # lmms-eval-style ``task/metric,none`` stripping in + # collect_results() resolves to what's in task_metrics.yaml. + return {spec.metric: float(value)} + + raise RuntimeError( + f"Could not locate metric {target_key!r} in any of " + f"{len(candidates)} AudioBench result JSON(s) under {run_dir}" + ) + + +def _find_metric(body: object, key: str) -> float | None: + """Recursive search for a numeric value keyed by *key* anywhere in *body*. + + AudioBench's per-task JSON has nested structure that has drifted across + releases (sometimes ``{"wer": 0.04}``, sometimes + ``{"metrics": {"wer": {"score": 0.04}}}``). We tolerate either form. 
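    For example, both layouts resolve to the same value::

        >>> _find_metric({"wer": 0.04}, "wer")
        0.04
        >>> _find_metric({"metrics": {"wer": {"score": 0.04}}}, "wer")
        0.04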
+ """ + if isinstance(body, dict): + if key in body: + candidate = body[key] + if isinstance(candidate, int | float): + return float(candidate) + if isinstance(candidate, dict) and "score" in candidate: + score = candidate["score"] + if isinstance(score, int | float): + return float(score) + for v in body.values(): + found = _find_metric(v, key) + if found is not None: + return found + elif isinstance(body, list): + for item in body: + found = _find_metric(item, key) + if found is not None: + return found + return None + + +def _write_lmms_shaped_json( + *, + output_path: Path, + model_path: str, + task_name: str, + n_shot: int, + metrics: dict[str, float], +) -> None: + """Write a lmms-eval-compatible JSON at *output_path*. + + :func:`oellm.main.collect_results` reads this shape directly; the + ``_resolve_metric`` fallback chain picks up our ``task_metrics`` + mapping to extract the primary value. + """ + payload = { + "model_name_or_path": model_path, + "results": {task_name: metrics}, + "configs": {task_name: {"num_fewshot": n_shot}}, + } + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + json.dump(payload, f, indent=2) + + +# --------------------------------------------------------------------------- +# parse_results — invoked by collect_results to recognise our output files. +# --------------------------------------------------------------------------- + + +def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: + """Recognise a JSON dict produced by :func:`run`. + + Detection heuristic: the ``results`` dict contains at least one key + that starts with ``"audiobench_"``. Returns the tuple expected by + :func:`oellm.main.collect_results`: + + ``(model_id, task_name, n_shot, {metric: value})`` + + Returns ``None`` for JSON blobs that don't belong to this suite. + """ + results = data.get("results", {}) + if not isinstance(results, dict): + return None + for task_name, task_results in results.items(): + if not isinstance(task_name, str) or not task_name.startswith("audiobench_"): + continue + if not isinstance(task_results, dict): + continue + model_id = data.get("model_name_or_path") or data.get("model_name") or "unknown" + n_shot = data.get("configs", {}).get(task_name, {}).get("num_fewshot", 0) + # Coerce everything that can be float; leave non-numeric alone so + # _resolve_metric can still see them. + coerced: dict[str, float] = {} + for k, v in task_results.items(): + if isinstance(v, int | float): + coerced[k] = float(v) + return model_id, task_name, int(n_shot), coerced + return None + + +# Re-exports used by the test suite. +__all__ = [ + "CLUSTER_ENV_VARS", + "SUITE_NAME", + "TASK_GROUPS", + "detect_model_flags", + "parse_results", + "run", +] + +# Silence unused-import lint (the symbol is exported for consumer reuse). +_ = os diff --git a/oellm/contrib/audiobench/task.py b/oellm/contrib/audiobench/task.py new file mode 100644 index 00000000..2a86bc14 --- /dev/null +++ b/oellm/contrib/audiobench/task.py @@ -0,0 +1,293 @@ +"""AudioBench task registry. + +Single source of truth for the AudioBench (AudioLLMs/AudioBench, arXiv 2406.16020) +Phase-1 task set. The registry is consumed by :mod:`oellm.contrib.audiobench.suite` +to auto-generate ``TASK_GROUPS`` and to look up per-task metadata (HF repo, +upstream task name, metric) at dispatch time. 
+ +Phase 1 = judge-free tasks only (27 total): + +- **20 new** benchmarks not covered by our lmms-eval task groups + (``earnings{21,22}``, ``gigaspeech2`` {thai, indonesian, vietnamese}, + ``aishell`` ZH ASR, ``seame`` code-switch, covost2 extra language pairs, + ``spoken-mqa`` reasoning splits, ``mmau_mini``, ``audiocaps`` METEOR). +- **7 dual-registered** duplicates of benchmarks we already run via lmms-eval + (LibriSpeech test-clean/other, Common Voice 15 EN, GigaSpeech, People's + Speech, TED-LIUM 3, covost2 en→zh). These use AudioBench's own scorer + and normalizer so WP4 can compare numbers against the AudioBench paper. + +Naming +------ +Every task name is prefixed ``audiobench_`` so the CSV ``task_path`` column +uniquely identifies the scorer and there is no collision with lmms-eval's +``librispeech_test_clean`` etc. :func:`AudioBenchTaskSpec.upstream_name` +returns the bare name that AudioBench's ``src/main_evaluate.py --dataset`` +flag expects. + +Phase 2 (judge-dependent tasks) will extend this registry with ~19 more +entries driven by a vLLM Llama-3-70B judge or the OpenAI API; see the +plugin README for the rollout plan. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +SUITE_NAME = "audiobench" +_TASK_NAME_PREFIX = "audiobench_" + + +@dataclass(frozen=True) +class AudioBenchTaskSpec: + """Metadata for a single AudioBench task. + + Attributes: + name: Canonical ``audiobench_*`` task name used in the CSV + ``task_path`` column and in ``task_metrics`` / ``task_groups``. + upstream_name: The ``--dataset`` value AudioBench's + ``src/main_evaluate.py`` expects (e.g. ``"librispeech_test_clean"``). + hf_repo: HuggingFace dataset repo ID for pre-download + (e.g. ``"AudioLLMs/librispeech_test_clean"``). + metric: Primary metric key written to our ``task_metrics`` mapping. + One of ``wer`` / ``bleu`` / ``accuracy`` / ``string_match`` / + ``meteor``. + upstream_metric: The value passed to AudioBench's ``--metrics`` CLI + flag. Usually identical to :attr:`metric` but allows divergence + when AudioBench uses a different key for the same score (e.g. + ``wer`` vs ``bleu`` match; ``accuracy`` vs upstream ``acc``). + family: One of ``"asr" | "st" | "reasoning"``. Controls which + ``audio-audiobench-*`` sub-group the task lands in. + data_dir: Optional upstream ``data_dir=...`` selector, used by the + gigaspeech2 multi-language repo. Passed to AudioBench via + ``--data_dir`` (upstream convention). + """ + + name: str + upstream_name: str + hf_repo: str + metric: str + upstream_metric: str + family: str + data_dir: str | None = None + + @property + def task_group(self) -> str: + """Return the ``audio-audiobench-*`` sub-group this task belongs to.""" + return f"audio-audiobench-{self.family}" + + +def _t( + upstream_name: str, + hf_repo: str, + metric: str, + family: str, + *, + upstream_metric: str | None = None, + data_dir: str | None = None, + name: str | None = None, +) -> AudioBenchTaskSpec: + """Helper — build an :class:`AudioBenchTaskSpec` with sensible defaults. + + By default the canonical name is ``audiobench_``. Pass + ``name`` to override (used when upstream names collide across + data_dir variants of the same HF repo, e.g. gigaspeech2). 
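    For example, with the defaults::

        >>> _t("earnings21_test", "AudioLLMs/earnings21_test", "wer", "asr").name
        'audiobench_earnings21_test'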
+ """ + return AudioBenchTaskSpec( + name=name if name is not None else _TASK_NAME_PREFIX + upstream_name, + upstream_name=upstream_name, + hf_repo=hf_repo, + metric=metric, + upstream_metric=upstream_metric or metric, + family=family, + data_dir=data_dir, + ) + + +# --------------------------------------------------------------------------- +# Bucket B — 20 genuinely new tasks (not in our lmms-eval task groups) +# --------------------------------------------------------------------------- + +_BUCKET_B_ASR = [ + # Mandarin ASR (not in lmms-eval). + _t("aishell_asr_zh_test", "AudioLLMs/aishell_1_zh_test", "wer", "asr"), + # Long-form English ASR from financial calls. + _t("earnings21_test", "AudioLLMs/earnings21_test", "wer", "asr"), + _t("earnings22_test", "AudioLLMs/earnings22_test", "wer", "asr"), + # Long-form TED talks (distinct from our tedlium_dev_test). + _t("tedlium3_long_form_test", "AudioLLMs/tedlium3_long_form_test", "wer", "asr"), + # GigaSpeech2 — multilingual SE-Asian ASR. All 3 share one HF repo and + # are disambiguated by ``data_dir``. Upstream --dataset name is the same, + # so we override ``name`` with a language suffix to keep canonical names + # unique in our CSV. + _t( + "gigaspeech2", + "AudioLLMs/gigaspeech2-test", + "wer", + "asr", + data_dir="th-test", + name="audiobench_gigaspeech2_thai", + ), + _t( + "gigaspeech2", + "AudioLLMs/gigaspeech2-test", + "wer", + "asr", + data_dir="id-test", + name="audiobench_gigaspeech2_indo", + ), + _t( + "gigaspeech2", + "AudioLLMs/gigaspeech2-test", + "wer", + "asr", + data_dir="vi-test", + name="audiobench_gigaspeech2_viet", + ), + # SEAME code-switch (English ↔ Mandarin). + _t("seame_dev_man", "AudioLLMs/seame_dev_man", "wer", "asr"), + _t("seame_dev_sge", "AudioLLMs/seame_dev_sge", "wer", "asr"), +] + +_BUCKET_B_ST = [ + # CoVoST2 language pairs not in lmms-eval (only en-zh is there). + _t("covost2_en_id_test", "AudioLLMs/covost2_en_id_test", "bleu", "st"), + _t("covost2_en_ta_test", "AudioLLMs/covost2_en_ta_test", "bleu", "st"), + _t("covost2_id_en_test", "AudioLLMs/covost2_id_en_test", "bleu", "st"), + _t("covost2_zh_en_test", "AudioLLMs/covost2_zh_en_test", "bleu", "st"), + _t("covost2_ta_en_test", "AudioLLMs/covost2_ta_en_test", "bleu", "st"), +] + +_BUCKET_B_REASONING = [ + # Spoken-MQA reasoning splits (GSM-8K-like, acc scoring). All 4 share + # one HF repo; the split is an upstream config — passed as ``data_dir`` + # so the YAML/HF snapshot_download dedups across splits while AudioBench + # still knows which split to read. + _t( + "spoken-mqa", + "amao0o0/spoken-mqa", + "accuracy", + "reasoning", + upstream_metric="acc", + data_dir="short_digit", + name="audiobench_spoken_mqa_short_digit", + ), + _t( + "spoken-mqa", + "amao0o0/spoken-mqa", + "accuracy", + "reasoning", + upstream_metric="acc", + data_dir="long_digit", + name="audiobench_spoken_mqa_long_digit", + ), + _t( + "spoken-mqa", + "amao0o0/spoken-mqa", + "accuracy", + "reasoning", + upstream_metric="acc", + data_dir="single_step_reasoning", + name="audiobench_spoken_mqa_single_step_reasoning", + ), + _t( + "spoken-mqa", + "amao0o0/spoken-mqa", + "accuracy", + "reasoning", + upstream_metric="acc", + data_dir="multi_step_reasoning", + name="audiobench_spoken_mqa_multi_step_reasoning", + ), + # MMAU mini — deterministic string-match scoring (judge-free path). + _t( + "mmau_mini", + "AudioLLMs/MMAU-mini", + "string_match", + "reasoning", + upstream_metric="string_match", + ), + # AudioCaps — METEOR is the judge-free scorer (judges also available). 
+ _t( + "audiocaps_test", + "AudioLLMs/audiocaps_test", + "meteor", + "reasoning", + upstream_metric="meteor", + ), +] + +# --------------------------------------------------------------------------- +# Bucket A — 7 dual-registered duplicates of benchmarks already in lmms-eval. +# These are for paper-comparability with AudioBench; the lmms-eval versions +# stay in place and produce independent numbers under their own task names. +# The HF repos are distinct (AudioLLMs/* vs lmms-lab/*) so there is no risk +# of snapshot_download collision. +# --------------------------------------------------------------------------- + +_BUCKET_A_DUAL = [ + # LibriSpeech (English ASR). + _t("librispeech_test_clean", "AudioLLMs/librispeech_test_clean", "wer", "asr"), + _t("librispeech_test_other", "AudioLLMs/librispeech_test_other", "wer", "asr"), + # Common Voice 15 English ASR. + _t("common_voice_15_en_test", "AudioLLMs/common_voice_15_en_test", "wer", "asr"), + # GigaSpeech v1 English ASR. + _t("gigaspeech_test", "AudioLLMs/gigaspeech_test", "wer", "asr"), + # People's Speech English ASR (note upstream repo name has the "s"). + _t("peoples_speech_test", "AudioLLMs/peoples_speech_test", "wer", "asr"), + # TED-LIUM 3 standard test (distinct from tedlium3_long_form_test above). + _t("tedlium3_test", "AudioLLMs/tedlium3_test", "wer", "asr"), + # CoVoST2 en→zh (ST). + _t("covost2_en_zh_test", "AudioLLMs/covost2_en_zh_test", "bleu", "st"), +] + + +# --------------------------------------------------------------------------- +# Public registry — flat list of all Phase-1 task specs. +# Order is stable (ASR / ST / reasoning) for deterministic YAML ordering +# and for readable test-failure diffs. +# --------------------------------------------------------------------------- + +AUDIOBENCH_TASKS: list[AudioBenchTaskSpec] = [ + *_BUCKET_B_ASR, + *_BUCKET_B_ST, + *_BUCKET_B_REASONING, + *_BUCKET_A_DUAL, +] + + +# Fail-fast consistency checks — runs at import time so a typo in the +# registry breaks the test suite rather than manifesting as a silent job +# routing error later. +def _validate() -> None: + seen_names: set[str] = set() + for t in AUDIOBENCH_TASKS: + if t.name in seen_names: + raise RuntimeError(f"Duplicate AudioBench task name {t.name!r} in registry") + seen_names.add(t.name) + if not t.name.startswith(_TASK_NAME_PREFIX): + raise RuntimeError( + f"AudioBench task {t.name!r} missing required prefix " + f"{_TASK_NAME_PREFIX!r}" + ) + if t.family not in {"asr", "st", "reasoning"}: + raise RuntimeError( + f"AudioBench task {t.name!r} has unknown family {t.family!r}" + ) + + +_validate() + + +def get_task_spec(name: str) -> AudioBenchTaskSpec: + """Look up an :class:`AudioBenchTaskSpec` by canonical task name. + + Raises + ------ + KeyError + If *name* does not correspond to any registered AudioBench task. + """ + for t in AUDIOBENCH_TASKS: + if t.name == name: + return t + known = sorted(t.name for t in AUDIOBENCH_TASKS) + raise KeyError(f"Unknown AudioBench task {name!r}. Known tasks: {', '.join(known)}") diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index d67ab948..bab9643b 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -13,7 +13,10 @@ CSV_PATH="{csv_path}" NUM_JOBS={num_jobs} TOTAL_EVALS={total_evals} -LIMIT="{limit}" +# Exported so contrib suite plugins (which spawn their own Python subprocesses +# via oellm.contrib.dispatch) can read it from os.environ. Built-in suites +# below still interpolate $LIMIT directly into their CLI flags. 
+export LIMIT="{limit}" VENV_PATH="{venv_path}" LM_EVAL_INCLUDE_PATH="{lm_eval_include_path}" diff --git a/pyproject.toml b/pyproject.toml index e8662646..09f9d036 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,20 @@ audio = [ "librosa", "jiwer", ] +# AudioBench contrib plugin. AudioBench itself is NOT pip-installable (no +# build backend upstream and bare imports like ``from dataset import ...`` +# that break after install), so we don't list it as a Python dependency. +# Instead, AUDIOBENCH_DIR in clusters.yaml points at a local git clone and +# suite.py subprocesses into ``python src/main_evaluate.py``. What we do +# need here is our own post-processing / scorer deps for result parsing. +audiobench = [ + "jiwer", # Phase 1 — WER result sanity checks + "sacrebleu", # Phase 1 — BLEU scorer verification (covost2) + "pythainlp", # Phase 1 — Thai tokenisation for gigaspeech2_thai + "evaluate", # Phase 1 — MMAU / METEOR post-processing + "soundfile", + "librosa", +] [project.scripts] oellm = "oellm.main:main" From 6e2f2a24fbd4adaf6654b76765a8f875c7696f6f Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Tue, 21 Apr 2026 11:35:22 +0200 Subject: [PATCH 2/6] add audiobench tests --- oellm/contrib/audiobench/README.md | 29 +- oellm/contrib/audiobench/adapter.py | 48 +- oellm/contrib/audiobench/suite.py | 190 ++------ oellm/contrib/audiobench/task.py | 153 ++---- oellm/resources/clusters.yaml | 17 +- pyproject.toml | 18 +- tests/test_audiobench.py | 725 ++++++++++++++++++++++++++++ 7 files changed, 830 insertions(+), 350 deletions(-) create mode 100644 tests/test_audiobench.py diff --git a/oellm/contrib/audiobench/README.md b/oellm/contrib/audiobench/README.md index b1eee38a..4306901a 100644 --- a/oellm/contrib/audiobench/README.md +++ b/oellm/contrib/audiobench/README.md @@ -8,7 +8,7 @@ can produce numbers directly comparable with the AudioBench paper and leaderboard, without the scoring-normalisation drift that would come from running the same datasets through lmms-eval. -## Scope — Phase 1 (this release) +## Scope **27 judge-free tasks** across ASR (WER), speech translation (BLEU), spoken reasoning (accuracy / string_match), and AudioCaps (METEOR). Of these: @@ -28,10 +28,9 @@ Every AudioBench task is namespaced with an `audiobench_` prefix so the CSV (e.g. `audiobench_librispeech_test_clean` is AudioBench-scored; `librispeech_test_clean` remains the lmms-eval version). -**Phase 2** (not in this release) will add ~19 judge-dependent tasks -(SLUE-SQA5, Spoken-SQuAD, AudioCaps-QA, IEMOCAP / MELD / VoxCeleb probes, -AudioLLM-InstructionFollowing) once a vLLM judge server is provisioned on -Leonardo. +Judge-dependent tasks (SLUE-SQA5, Spoken-SQuAD, AudioCaps-QA, IEMOCAP / +MELD / VoxCeleb probes, AudioLLM-InstructionFollowing) are not included +and depend on a vLLM judge service being provisioned on Leonardo. ## Prerequisites @@ -99,8 +98,7 @@ AudioBench itself. No manual steps required. `schedule-evals` auto-downloads every `AudioLLMs/*` HF repo referenced by the requested task group on the login node via `huggingface_hub.snapshot_download(max_workers=2)` so the -compute nodes do not need internet access. The rate-limit-friendly -`max_workers=2` is shared infrastructure — see `oellm/utils.py`. +compute nodes do not need internet access. ## Running @@ -108,7 +106,7 @@ compute nodes do not need internet access. 
The rate-limit-friendly | Task group | Leaves | What it covers | |----------------------------------|--------|-----------------------------------------------------------------| -| `audio-audiobench` | 27 | Full Phase-1 suite (everything below). | +| `audio-audiobench` | 27 | Full suite (everything below). | | `audio-audiobench-asr` | 15 | WER tasks — 9 new + 6 dual-registered with lmms-eval. | | `audio-audiobench-st` | 6 | BLEU speech-translation — 5 new + 1 dual (en→zh). | | `audio-audiobench-reasoning` | 6 | Spoken-MQA × 4, MMAU mini, AudioCaps METEOR. | @@ -116,7 +114,7 @@ compute nodes do not need internet access. The rate-limit-friendly ### Example ```bash -# Full AudioBench Phase-1 suite on a Qwen2-Audio model: +# Full AudioBench suite on a Qwen2-Audio model: oellm schedule-evals \ --models Qwen/Qwen2-Audio-7B-Instruct \ --task-groups audio-audiobench \ @@ -188,12 +186,11 @@ column: `audiobench:qwen2_audio`. The dispatcher in 5. `collect-results` reads it via `parse_results()` and the standard `_resolve_metric` fallback chain — no special-casing in core code. -## Open questions / Phase-2 prerequisites +## Open items -- **Judge service hosting:** Phase 2 needs a Llama-3-70B-AWQ judge on an - OpenAI-compatible endpoint. Plan is a separate long-running vLLM sbatch - whose URL/model lands in `clusters.yaml` as `AUDIOBENCH_JUDGE_URL` and - `AUDIOBENCH_JUDGE_MODEL`. +- **Judge service hosting:** judge-dependent tasks need a Llama-3-70B-AWQ + judge on an OpenAI-compatible endpoint. Plan is a separate long-running + vLLM sbatch whose URL/model lands in `clusters.yaml` as + `AUDIOBENCH_JUDGE_URL` and `AUDIOBENCH_JUDGE_MODEL`. - **MERaLiON / IMDA NSC tasks:** ~21 gated AudioBench tasks require - corpora not on public HF. These will ship in a later phase — or not, - depending on whether WP4 needs them. + corpora not on public HF. Deferred until WP4 needs them. diff --git a/oellm/contrib/audiobench/adapter.py b/oellm/contrib/audiobench/adapter.py index 2b15c7f6..4734ba69 100644 --- a/oellm/contrib/audiobench/adapter.py +++ b/oellm/contrib/audiobench/adapter.py @@ -1,40 +1,20 @@ """AudioBench model adapter. -Maps a HuggingFace model path (or local filesystem path) to the string key -that AudioBench's ``src/main_evaluate.py --model`` argument expects. The -upstream dispatch table lives in ``AudioBench/src/model.py`` and is -hand-wired — one entry per model family. - -The adapter returns one of: - -- ``"qwen2_audio"`` — Qwen2-Audio / Qwen-Audio checkpoints. -- ``"salmonn"`` — SALMONN family (Tsinghua). -- ``"ltu"`` — Listen-Think-Understand. -- ``"whisper"`` — Whisper (OpenAI). -- ``"audioflamingo"`` — Audio-Flamingo (NVIDIA). -- ``"meralion"`` — MERaLiON (Singapore-NLP). -- ``"generic"`` — fallback. AudioBench treats this as the default HF - pipeline dispatch, which works for many generic audio - LLMs but may need tuning per model. - -The detected value is passed to :mod:`oellm.contrib.dispatch` as the -``model_flags`` portion of the ``eval_suite`` column -(``audiobench:``), exactly like the regiondial_bench pattern. +Maps a HuggingFace model path to the string key that AudioBench's +``src/main_evaluate.py --model`` argument expects. The detected value is +passed to :mod:`oellm.contrib.dispatch` as the ``model_flags`` portion of +the ``eval_suite`` column (``audiobench:``). """ from __future__ import annotations from oellm.core.base_model_adapter import BaseModelAdapter -# (model-family key, substrings to match in lowered model path) -# Order matters — first match wins. 
More-specific patterns must appear -# before their super-strings (e.g. "qwen2-audio" before "qwen"). +# (model-family key, substrings to match in lowered model path). Order +# matters — first match wins, so more-specific patterns come first. _PATTERNS: list[tuple[str, tuple[str, ...]]] = [ ("qwen2_audio", ("qwen2-audio", "qwen2_audio", "qwen-audio", "qwen_audio")), ("salmonn", ("salmonn",)), - # LTU checkpoints often have paths like "ltu-as/", "ltu-7b", or - # "MIT/ltu". Prefix with "/" / "-" where possible to avoid false - # matches (e.g. "altus"). ("ltu", ("ltu-", "/ltu", "_ltu", "ltu_as")), ("whisper", ("whisper-", "/whisper", "openai/whisper")), ("audioflamingo", ("audio-flamingo", "audioflamingo", "audio_flamingo")), @@ -43,7 +23,7 @@ class AudioBenchModelAdapter(BaseModelAdapter): - """Adapter that resolves ``--model`` flag for AudioBench subprocess.""" + """Adapter resolving the ``--model`` flag for the AudioBench subprocess.""" def __init__(self, model_path: str) -> None: self._path = model_path @@ -53,16 +33,15 @@ def model_path(self) -> str: return self._path def to_lm_eval_args(self) -> str: - # Not used — AudioBench doesn't route through lm-eval. Provided - # only to satisfy the BaseModelAdapter contract. + # Unused — AudioBench doesn't route through lm-eval. Required by + # BaseModelAdapter. return f"pretrained={self._path},trust_remote_code=True" def to_lmms_eval_args(self) -> str: - # Not used — see note on to_lm_eval_args(). + # Unused — see to_lm_eval_args(). return f"pretrained={self._path}" def to_contrib_flags(self) -> str | None: - """Return the AudioBench ``--model`` key for this model path.""" lowered = self._path.lower() for key, needles in _PATTERNS: if any(n in lowered for n in needles): @@ -71,10 +50,5 @@ def to_contrib_flags(self) -> str | None: def detect_audiobench_model_type(model_path: str) -> str: - """Module-level convenience — matches :func:`oellm.constants.detect_lmms_model_type`. - - Returns the same value as - ``AudioBenchModelAdapter(model_path).to_contrib_flags()`` but never - returns ``None`` (falls back to ``"generic"``). - """ + """Like ``to_contrib_flags`` but always returns a string (default ``generic``).""" return AudioBenchModelAdapter(model_path).to_contrib_flags() or "generic" diff --git a/oellm/contrib/audiobench/suite.py b/oellm/contrib/audiobench/suite.py index 7fed48a1..e601e784 100644 --- a/oellm/contrib/audiobench/suite.py +++ b/oellm/contrib/audiobench/suite.py @@ -1,47 +1,11 @@ """AudioBench contrib suite — plugin protocol implementation. -Implements the :mod:`oellm.registry` plugin protocol for the AudioBench -benchmark (AudioLLMs/AudioBench, arXiv 2406.16020). AudioBench is **not** a -pip-installable library — it is a script harness. We invoke its entry point -via ``python src/main_evaluate.py`` as a subprocess, from a clone pointed at -by the ``$AUDIOBENCH_DIR`` environment variable (configured in -``clusters.yaml``). This mirrors the precedent set by ``regiondial_bench``. - -Cluster setup -------------- -The following environment variables must be set in ``clusters.yaml`` (or the -cluster's module/profile system) before using any ``audio-audiobench-*`` -task group: - -``AUDIOBENCH_DIR`` - Absolute path to a local clone of - https://github.com/AudioLLMs/AudioBench. The entry point - ``src/main_evaluate.py`` must be present and the repo's own Python - dependencies must be installed in the active environment. 
- -Phase 2 (judge-dependent tasks) will additionally require: - -``AUDIOBENCH_JUDGE_URL`` / ``AUDIOBENCH_JUDGE_MODEL`` - OpenAI-compatible URL and model name for the judge server (typically a - vLLM deployment of ``meta-llama/Meta-Llama-3-70B-Instruct-AWQ``). Not - needed for Phase-1 judge-free tasks shipped today. - -Output format -------------- -:func:`run` writes a lmms-eval-compatible JSON file to *output_path* so -that :func:`oellm.main.collect_results` can parse it without modification:: - - { - "model_name_or_path": "", - "results": { - "audiobench_librispeech_test_clean": { - "wer": 0.047 - } - }, - "configs": { - "audiobench_librispeech_test_clean": {"num_fewshot": 0} - } - } +AudioBench is not pip-installable (upstream has no build backend and uses +bare imports like ``from dataset import ...``), so :func:`run` invokes its +``src/main_evaluate.py`` entry point as a subprocess with ``cwd`` set to +``$AUDIOBENCH_DIR``. :func:`run` then re-shapes AudioBench's result JSON +into a lmms-eval-compatible payload that :func:`oellm.main.collect_results` +can parse unchanged. """ from __future__ import annotations @@ -63,65 +27,45 @@ CLUSTER_ENV_VARS = ["AUDIOBENCH_DIR"] -# Mapping family → (group_name, human description). _FAMILY_GROUPS = { "asr": ( "audio-audiobench-asr", - "AudioBench ASR tasks (WER). Covers AudioBench-scored LibriSpeech, " - "Common Voice 15 EN, GigaSpeech, People's Speech, TED-LIUM 3 — dual " - "with our lmms-eval versions for paper-comparable numbers — plus new " - "tasks not in lmms-eval: earnings21/22, TED-LIUM 3 long-form, " - "AISHELL Mandarin, GigaSpeech2 (th/id/vi), SEAME code-switch.", + "AudioBench ASR tasks (WER).", ), "st": ( "audio-audiobench-st", - "AudioBench speech-translation tasks (BLEU). CoVoST2 covering " - "en↔id, en↔ta, zh→en, ta→en (new), plus dual-registered en→zh.", + "AudioBench speech-translation tasks (BLEU).", ), "reasoning": ( "audio-audiobench-reasoning", - "AudioBench spoken reasoning / captioning. Spoken-MQA digit + " - "reasoning splits (accuracy), MMAU-mini (string_match), " - "AudioCaps (METEOR).", + "AudioBench spoken reasoning / captioning (accuracy / string_match / METEOR).", ), } _TOP_LEVEL_GROUP = "audio-audiobench" _TOP_LEVEL_DESC = ( - "AudioBench Phase-1 suite (judge-free). Runs all 27 AudioBench tasks " - "that do not require an LLM judge: ASR (WER), speech translation (BLEU), " - "spoken reasoning (accuracy/string_match), and AudioCaps captioning " - "(METEOR). Phase 2 (judge-dependent tasks) will extend this group once " - "the judge service is configured." + "AudioBench suite — ASR (WER), speech translation (BLEU), spoken " + "reasoning (accuracy/string_match), and AudioCaps captioning (METEOR)." ) def _build_task_groups() -> dict: - """Assemble the :data:`TASK_GROUPS` dict from :data:`AUDIOBENCH_TASKS`. + """Build ``TASK_GROUPS`` from :data:`AUDIOBENCH_TASKS`. - One top-level ``audio-audiobench`` group containing all 27 leaves, plus - three sub-groups keyed by family (``-asr`` / ``-st`` / ``-reasoning``). - All groups are zero-shot by design — AudioBench tasks do not support - in-context examples. + Always zero-shot — AudioBench does not support in-context examples. """ task_metrics: dict[str, str] = {t.name: t.metric for t in AUDIOBENCH_TASKS} def _task_entry(t: AudioBenchTaskSpec) -> dict: - entry: dict = {"task": t.name, "dataset": t.hf_repo} - # ``data_dir``-style subsetting: we deliberately do NOT set ``subset`` - # in the YAML entry. 
The reason is that ``load_dataset(name=...)`` - # used by ``_pre_download_datasets_from_specs`` treats ``subset`` as a - # config name, not a ``data_dir`` — and for gigaspeech2/spoken-mqa the - # upstream distinction is a data_dir, not a config. Since the group - # name starts with "audio-", ``_collect_dataset_specs`` auto-sets - # ``needs_snapshot_download=True`` which downloads the whole repo, - # so AudioBench can read the right data_dir at runtime. This also - # means multiple tasks sharing one HF repo dedupe to a single spec. - return entry + # We deliberately omit ``subset`` — load_dataset treats it as a + # config name, but for gigaspeech2 / spoken-mqa the upstream + # distinction is a ``data_dir``. The ``audio-*`` prefix triggers + # full-repo snapshot_download, so AudioBench can read the right + # data_dir at runtime. + return {"task": t.name, "dataset": t.hf_repo} groups: dict[str, dict] = {} - # Sub-groups per family. tasks_by_family: dict[str, list[AudioBenchTaskSpec]] = { "asr": [], "st": [], @@ -141,7 +85,6 @@ def _task_entry(t: AudioBenchTaskSpec) -> dict: "tasks": [_task_entry(t) for t in entries], } - # Top-level group — union of everything. groups[_TOP_LEVEL_GROUP] = { "suite": SUITE_NAME, "n_shots": [0], @@ -155,28 +98,13 @@ def _task_entry(t: AudioBenchTaskSpec) -> dict: TASK_GROUPS: dict = _build_task_groups() -# --------------------------------------------------------------------------- -# Model-flag detection. -# --------------------------------------------------------------------------- - - def detect_model_flags(model_path: str) -> str | None: - """Delegate to :class:`AudioBenchModelAdapter`. - - Called by :class:`oellm.runner.EvalRunner.resolve_suite` to append the - AudioBench model-family key to ``eval_suite`` as - ``audiobench:``. - """ + """Return the AudioBench ``--model`` family key for *model_path*.""" from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter return AudioBenchModelAdapter(model_path).to_contrib_flags() -# --------------------------------------------------------------------------- -# Runtime — subprocess into AudioBench's src/main_evaluate.py. -# --------------------------------------------------------------------------- - - def run( *, model_path: str, @@ -186,24 +114,10 @@ def run( model_flags: str | None, env: dict[str, str], ) -> None: - """Execute one AudioBench task and write lmms-eval-shaped JSON. - - Args: - model_path: HF repo ID or local path of the model under evaluation. - task: Canonical task name (must start with ``audiobench_``). - n_shot: Always 0 for AudioBench — recorded in the output ``configs`` - block for downstream compatibility. - output_path: Destination for the lmms-eval-compatible result JSON. - model_flags: AudioBench ``--model`` key (e.g. ``"qwen2_audio"``); - produced by :func:`detect_model_flags`. Falls back to - ``"generic"`` if not supplied. - env: Environment dict passed to the subprocess. Must contain - ``AUDIOBENCH_DIR`` (validated by dispatch.py before ``run`` is - called, but we re-check for safety). - - Raises: - RuntimeError: if AudioBench returns non-zero or produces no output. - KeyError: if *task* is not in the registry. + """Execute one AudioBench task and write a lmms-eval-shaped result JSON. + + Raises ``RuntimeError`` if AudioBench exits non-zero or produces no + parseable output, and ``KeyError`` if *task* is not registered. 
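    A typical dispatcher-side call looks roughly like this (output path and
    environment are illustrative)::

        run(
            model_path="Qwen/Qwen2-Audio-7B-Instruct",
            task="audiobench_librispeech_test_clean",
            n_shot=0,
            output_path=Path("evals/audiobench_librispeech_test_clean.json"),
            model_flags="qwen2_audio",
            env={**os.environ, "AUDIOBENCH_DIR": "/path/to/AudioBench"},
        )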
""" ab_dir = env.get("AUDIOBENCH_DIR") if not ab_dir: @@ -224,8 +138,6 @@ def run( spec = get_task_spec(task) model_key = model_flags or "generic" - # AudioBench writes outputs under a run-specific log directory; we set - # it to our output_path's parent so we can recover the raw result. run_dir = output_path.parent / f"audiobench_{output_path.stem}" run_dir.mkdir(parents=True, exist_ok=True) @@ -246,8 +158,6 @@ def run( if spec.data_dir: cmd.extend(["--data_dir", spec.data_dir]) - # Forward LIMIT (set by template.sbatch) as AudioBench's - # --number_of_samples when present. "-1" means no limit in AudioBench. limit = env.get("LIMIT", "").strip() if limit: cmd.extend(["--number_of_samples", str(limit)]) @@ -277,17 +187,7 @@ def run( def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float]: - """Find AudioBench's per-task score JSON inside *run_dir* and read it. - - AudioBench writes one JSON file per task under its ``--log_dir`` with - the score under a key matching ``--metrics``. We search recursively - for any ``*.json`` and pick the first one whose body contains the - expected metric key. This is intentionally lenient because upstream - log-layout has changed across releases. - - Raises: - RuntimeError: if no matching result file is found. - """ + """Find AudioBench's per-task result JSON under *run_dir* and read it.""" candidates = sorted(run_dir.rglob("*.json")) if not candidates: raise RuntimeError( @@ -304,9 +204,8 @@ def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float continue value = _find_metric(body, target_key) if value is not None: - # Emit the metric under OUR canonical key (spec.metric) so the - # lmms-eval-style ``task/metric,none`` stripping in - # collect_results() resolves to what's in task_metrics.yaml. + # Emit under our canonical key so collect_results' metric + # resolution picks up task_metrics.yaml. return {spec.metric: float(value)} raise RuntimeError( @@ -316,11 +215,10 @@ def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float def _find_metric(body: object, key: str) -> float | None: - """Recursive search for a numeric value keyed by *key* anywhere in *body*. + """Recursive search for a numeric value keyed by *key*. - AudioBench's per-task JSON has nested structure that has drifted across - releases (sometimes ``{"wer": 0.04}``, sometimes - ``{"metrics": {"wer": {"score": 0.04}}}``). We tolerate either form. + Tolerates both ``{"wer": 0.04}`` and ``{"metrics": {"wer": {"score": + 0.04}}}`` layouts — upstream log shape has drifted across releases. """ if isinstance(body, dict): if key in body: @@ -351,12 +249,6 @@ def _write_lmms_shaped_json( n_shot: int, metrics: dict[str, float], ) -> None: - """Write a lmms-eval-compatible JSON at *output_path*. - - :func:`oellm.main.collect_results` reads this shape directly; the - ``_resolve_metric`` fallback chain picks up our ``task_metrics`` - mapping to extract the primary value. - """ payload = { "model_name_or_path": model_path, "results": {task_name: metrics}, @@ -367,21 +259,9 @@ def _write_lmms_shaped_json( json.dump(payload, f, indent=2) -# --------------------------------------------------------------------------- -# parse_results — invoked by collect_results to recognise our output files. -# --------------------------------------------------------------------------- - - def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: - """Recognise a JSON dict produced by :func:`run`. 
- - Detection heuristic: the ``results`` dict contains at least one key - that starts with ``"audiobench_"``. Returns the tuple expected by - :func:`oellm.main.collect_results`: - - ``(model_id, task_name, n_shot, {metric: value})`` - - Returns ``None`` for JSON blobs that don't belong to this suite. + """Recognise a JSON dict produced by :func:`run` and return + ``(model_id, task_name, n_shot, metrics)``; ``None`` if it's not ours. """ results = data.get("results", {}) if not isinstance(results, dict): @@ -393,8 +273,6 @@ def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: continue model_id = data.get("model_name_or_path") or data.get("model_name") or "unknown" n_shot = data.get("configs", {}).get(task_name, {}).get("num_fewshot", 0) - # Coerce everything that can be float; leave non-numeric alone so - # _resolve_metric can still see them. coerced: dict[str, float] = {} for k, v in task_results.items(): if isinstance(v, int | float): @@ -403,7 +281,6 @@ def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: return None -# Re-exports used by the test suite. __all__ = [ "CLUSTER_ENV_VARS", "SUITE_NAME", @@ -413,5 +290,4 @@ def parse_results(data: dict) -> tuple[str, str, int, dict[str, float]] | None: "run", ] -# Silence unused-import lint (the symbol is exported for consumer reuse). -_ = os +_ = os # exported via env dict passed to subprocess.run diff --git a/oellm/contrib/audiobench/task.py b/oellm/contrib/audiobench/task.py index 2a86bc14..849477f1 100644 --- a/oellm/contrib/audiobench/task.py +++ b/oellm/contrib/audiobench/task.py @@ -1,32 +1,12 @@ """AudioBench task registry. -Single source of truth for the AudioBench (AudioLLMs/AudioBench, arXiv 2406.16020) -Phase-1 task set. The registry is consumed by :mod:`oellm.contrib.audiobench.suite` -to auto-generate ``TASK_GROUPS`` and to look up per-task metadata (HF repo, -upstream task name, metric) at dispatch time. +Single source of truth for the task set. Consumed by +:mod:`oellm.contrib.audiobench.suite` to build ``TASK_GROUPS`` and to look up +per-task metadata (HF repo, upstream task name, metric) at dispatch time. -Phase 1 = judge-free tasks only (27 total): - -- **20 new** benchmarks not covered by our lmms-eval task groups - (``earnings{21,22}``, ``gigaspeech2`` {thai, indonesian, vietnamese}, - ``aishell`` ZH ASR, ``seame`` code-switch, covost2 extra language pairs, - ``spoken-mqa`` reasoning splits, ``mmau_mini``, ``audiocaps`` METEOR). -- **7 dual-registered** duplicates of benchmarks we already run via lmms-eval - (LibriSpeech test-clean/other, Common Voice 15 EN, GigaSpeech, People's - Speech, TED-LIUM 3, covost2 en→zh). These use AudioBench's own scorer - and normalizer so WP4 can compare numbers against the AudioBench paper. - -Naming ------- -Every task name is prefixed ``audiobench_`` so the CSV ``task_path`` column -uniquely identifies the scorer and there is no collision with lmms-eval's -``librispeech_test_clean`` etc. :func:`AudioBenchTaskSpec.upstream_name` -returns the bare name that AudioBench's ``src/main_evaluate.py --dataset`` -flag expects. - -Phase 2 (judge-dependent tasks) will extend this registry with ~19 more -entries driven by a vLLM Llama-3-70B judge or the OpenAI API; see the -plugin README for the rollout plan. +Every canonical task name is prefixed ``audiobench_`` so the CSV ``task_path`` +column uniquely identifies the scorer and doesn't collide with lmms-eval's +names for the same benchmark. 
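A quick round-trip sketch of the contract between the JSON that `run()` writes and what `parse_results` hands back to `collect_results` (paths and scores are made up):

```python
# Round-trip sketch: run() writes this shape; parse_results() recognises it.
from oellm.contrib.audiobench.suite import parse_results

blob = {
    "model_name_or_path": "/models/Qwen2-Audio-7B-Instruct",
    "results": {"audiobench_mmau_mini": {"string_match": 0.41}},
    "configs": {"audiobench_mmau_mini": {"num_fewshot": 0}},
}
model_id, task_name, n_shot, metrics = parse_results(blob)
assert (model_id, task_name, n_shot) == (
    "/models/Qwen2-Audio-7B-Instruct", "audiobench_mmau_mini", 0
)
assert metrics == {"string_match": 0.41}

# lmms-eval blobs (no ``audiobench_`` prefix in results) are not ours:
assert parse_results({"results": {"librispeech_test_clean": {"wer": 0.05}}}) is None
```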
""" from __future__ import annotations @@ -41,25 +21,11 @@ class AudioBenchTaskSpec: """Metadata for a single AudioBench task. - Attributes: - name: Canonical ``audiobench_*`` task name used in the CSV - ``task_path`` column and in ``task_metrics`` / ``task_groups``. - upstream_name: The ``--dataset`` value AudioBench's - ``src/main_evaluate.py`` expects (e.g. ``"librispeech_test_clean"``). - hf_repo: HuggingFace dataset repo ID for pre-download - (e.g. ``"AudioLLMs/librispeech_test_clean"``). - metric: Primary metric key written to our ``task_metrics`` mapping. - One of ``wer`` / ``bleu`` / ``accuracy`` / ``string_match`` / - ``meteor``. - upstream_metric: The value passed to AudioBench's ``--metrics`` CLI - flag. Usually identical to :attr:`metric` but allows divergence - when AudioBench uses a different key for the same score (e.g. - ``wer`` vs ``bleu`` match; ``accuracy`` vs upstream ``acc``). - family: One of ``"asr" | "st" | "reasoning"``. Controls which - ``audio-audiobench-*`` sub-group the task lands in. - data_dir: Optional upstream ``data_dir=...`` selector, used by the - gigaspeech2 multi-language repo. Passed to AudioBench via - ``--data_dir`` (upstream convention). + ``upstream_name`` is what AudioBench's ``--dataset`` flag expects; + ``upstream_metric`` is what ``--metrics`` expects (usually identical to + ``metric``). ``data_dir`` is the optional upstream ``--data_dir`` + selector used when multiple tasks share one HF repo (gigaspeech2, + spoken-mqa). """ name: str @@ -72,7 +38,6 @@ class AudioBenchTaskSpec: @property def task_group(self) -> str: - """Return the ``audio-audiobench-*`` sub-group this task belongs to.""" return f"audio-audiobench-{self.family}" @@ -86,12 +51,7 @@ def _t( data_dir: str | None = None, name: str | None = None, ) -> AudioBenchTaskSpec: - """Helper — build an :class:`AudioBenchTaskSpec` with sensible defaults. - - By default the canonical name is ``audiobench_``. Pass - ``name`` to override (used when upstream names collide across - data_dir variants of the same HF repo, e.g. gigaspeech2). - """ + """Build a spec with ``name = audiobench_`` by default.""" return AudioBenchTaskSpec( name=name if name is not None else _TASK_NAME_PREFIX + upstream_name, upstream_name=upstream_name, @@ -103,22 +63,13 @@ def _t( ) -# --------------------------------------------------------------------------- -# Bucket B — 20 genuinely new tasks (not in our lmms-eval task groups) -# --------------------------------------------------------------------------- - -_BUCKET_B_ASR = [ - # Mandarin ASR (not in lmms-eval). +# Tasks not covered by our lmms-eval task groups. +_NEW_ASR = [ _t("aishell_asr_zh_test", "AudioLLMs/aishell_1_zh_test", "wer", "asr"), - # Long-form English ASR from financial calls. _t("earnings21_test", "AudioLLMs/earnings21_test", "wer", "asr"), _t("earnings22_test", "AudioLLMs/earnings22_test", "wer", "asr"), - # Long-form TED talks (distinct from our tedlium_dev_test). _t("tedlium3_long_form_test", "AudioLLMs/tedlium3_long_form_test", "wer", "asr"), - # GigaSpeech2 — multilingual SE-Asian ASR. All 3 share one HF repo and - # are disambiguated by ``data_dir``. Upstream --dataset name is the same, - # so we override ``name`` with a language suffix to keep canonical names - # unique in our CSV. + # GigaSpeech2 — 3 languages share one HF repo, disambiguated by data_dir. _t( "gigaspeech2", "AudioLLMs/gigaspeech2-test", @@ -143,13 +94,11 @@ def _t( data_dir="vi-test", name="audiobench_gigaspeech2_viet", ), - # SEAME code-switch (English ↔ Mandarin). 
_t("seame_dev_man", "AudioLLMs/seame_dev_man", "wer", "asr"), _t("seame_dev_sge", "AudioLLMs/seame_dev_sge", "wer", "asr"), ] -_BUCKET_B_ST = [ - # CoVoST2 language pairs not in lmms-eval (only en-zh is there). +_NEW_ST = [ _t("covost2_en_id_test", "AudioLLMs/covost2_en_id_test", "bleu", "st"), _t("covost2_en_ta_test", "AudioLLMs/covost2_en_ta_test", "bleu", "st"), _t("covost2_id_en_test", "AudioLLMs/covost2_id_en_test", "bleu", "st"), @@ -157,11 +106,8 @@ def _t( _t("covost2_ta_en_test", "AudioLLMs/covost2_ta_en_test", "bleu", "st"), ] -_BUCKET_B_REASONING = [ - # Spoken-MQA reasoning splits (GSM-8K-like, acc scoring). All 4 share - # one HF repo; the split is an upstream config — passed as ``data_dir`` - # so the YAML/HF snapshot_download dedups across splits while AudioBench - # still knows which split to read. +_NEW_REASONING = [ + # Spoken-MQA — 4 splits share one HF repo; split is an upstream data_dir. _t( "spoken-mqa", "amao0o0/spoken-mqa", @@ -198,66 +144,33 @@ def _t( data_dir="multi_step_reasoning", name="audiobench_spoken_mqa_multi_step_reasoning", ), - # MMAU mini — deterministic string-match scoring (judge-free path). - _t( - "mmau_mini", - "AudioLLMs/MMAU-mini", - "string_match", - "reasoning", - upstream_metric="string_match", - ), - # AudioCaps — METEOR is the judge-free scorer (judges also available). - _t( - "audiocaps_test", - "AudioLLMs/audiocaps_test", - "meteor", - "reasoning", - upstream_metric="meteor", - ), + _t("mmau_mini", "AudioLLMs/MMAU-mini", "string_match", "reasoning"), + _t("audiocaps_test", "AudioLLMs/audiocaps_test", "meteor", "reasoning"), ] -# --------------------------------------------------------------------------- -# Bucket A — 7 dual-registered duplicates of benchmarks already in lmms-eval. -# These are for paper-comparability with AudioBench; the lmms-eval versions -# stay in place and produce independent numbers under their own task names. -# The HF repos are distinct (AudioLLMs/* vs lmms-lab/*) so there is no risk -# of snapshot_download collision. -# --------------------------------------------------------------------------- - -_BUCKET_A_DUAL = [ - # LibriSpeech (English ASR). +# Dual-registered duplicates of benchmarks also in lmms-eval. These use +# AudioBench's scorer/normaliser for paper-comparable numbers; the lmms-eval +# versions stay in place. HF repos differ (AudioLLMs/* vs lmms-lab/*) so +# snapshot_download does not collide. +_DUAL = [ _t("librispeech_test_clean", "AudioLLMs/librispeech_test_clean", "wer", "asr"), _t("librispeech_test_other", "AudioLLMs/librispeech_test_other", "wer", "asr"), - # Common Voice 15 English ASR. _t("common_voice_15_en_test", "AudioLLMs/common_voice_15_en_test", "wer", "asr"), - # GigaSpeech v1 English ASR. _t("gigaspeech_test", "AudioLLMs/gigaspeech_test", "wer", "asr"), - # People's Speech English ASR (note upstream repo name has the "s"). _t("peoples_speech_test", "AudioLLMs/peoples_speech_test", "wer", "asr"), - # TED-LIUM 3 standard test (distinct from tedlium3_long_form_test above). _t("tedlium3_test", "AudioLLMs/tedlium3_test", "wer", "asr"), - # CoVoST2 en→zh (ST). _t("covost2_en_zh_test", "AudioLLMs/covost2_en_zh_test", "bleu", "st"), ] -# --------------------------------------------------------------------------- -# Public registry — flat list of all Phase-1 task specs. -# Order is stable (ASR / ST / reasoning) for deterministic YAML ordering -# and for readable test-failure diffs. 
-# --------------------------------------------------------------------------- - AUDIOBENCH_TASKS: list[AudioBenchTaskSpec] = [ - *_BUCKET_B_ASR, - *_BUCKET_B_ST, - *_BUCKET_B_REASONING, - *_BUCKET_A_DUAL, + *_NEW_ASR, + *_NEW_ST, + *_NEW_REASONING, + *_DUAL, ] -# Fail-fast consistency checks — runs at import time so a typo in the -# registry breaks the test suite rather than manifesting as a silent job -# routing error later. def _validate() -> None: seen_names: set[str] = set() for t in AUDIOBENCH_TASKS: @@ -279,13 +192,7 @@ def _validate() -> None: def get_task_spec(name: str) -> AudioBenchTaskSpec: - """Look up an :class:`AudioBenchTaskSpec` by canonical task name. - - Raises - ------ - KeyError - If *name* does not correspond to any registered AudioBench task. - """ + """Look up a spec by canonical task name; raises ``KeyError`` if missing.""" for t in AUDIOBENCH_TASKS: if t.name == name: return t diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index 36e23d29..370201b6 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -1,5 +1,5 @@ shared: - TIME_LIMIT: "00:30:00" # time limit in the format HH:MM:SS + TIME_LIMIT: "02:30:00" # time limit in the format HH:MM:SS UV_LINK_MODE: "copy" EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 @@ -7,13 +7,16 @@ shared: HF_DATASETS_DISABLE_PROGRESS_BARS: "1" leonardo: - hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML - EVAL_BASE_DIR: "/leonardo_work/AIFAC_L01_028/oellm-cli-shared-evals" - PARTITION: "boost_usr_prod" # default partition to use - ACCOUNT: "OELLM_prod2026" # default account to use - QUEUE_LIMIT: 1000 # maximum number of jobs that can be submitted as job/array, used to send only jobs that respects QOS - EVAL_CONTAINER_IMAGE: "eval_env-leonardo.sif" # name of the container image that is pulled which is built automatically with Github actions + hostname_pattern: "*.leonardo.local" + EVAL_BASE_DIR: "/leonardo/home/userexternal/islobozh/oellm-cli-shared-evals/" + PARTITION: "boost_usr_prod" + ACCOUNT: "OELLM_prod2026" + QUEUE_LIMIT: 1000 + EVAL_CONTAINER_IMAGE: "eval_env-leonardo.sif" SINGULARITY_ARGS: "--nv" + HF_HOME: "/leonardo_work/OELLM_prod2026/huggingface" + GPUS_PER_NODE: 4 + REGION_REASONER_DIR: "/leonardo/home/userexternal/islobozh/RegionReasoner" jureca: hostname_pattern: "*.jureca" diff --git a/pyproject.toml b/pyproject.toml index 09f9d036..4128db93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,17 +33,15 @@ audio = [ "librosa", "jiwer", ] -# AudioBench contrib plugin. AudioBench itself is NOT pip-installable (no -# build backend upstream and bare imports like ``from dataset import ...`` -# that break after install), so we don't list it as a Python dependency. -# Instead, AUDIOBENCH_DIR in clusters.yaml points at a local git clone and -# suite.py subprocesses into ``python src/main_evaluate.py``. What we do -# need here is our own post-processing / scorer deps for result parsing. +# AudioBench contrib plugin. AudioBench itself is not pip-installable +# (no build backend upstream, bare imports), so AUDIOBENCH_DIR in +# clusters.yaml points at a local git clone and suite.py subprocesses into +# ``python src/main_evaluate.py``. These are our post-processing deps. 
audiobench = [ - "jiwer", # Phase 1 — WER result sanity checks - "sacrebleu", # Phase 1 — BLEU scorer verification (covost2) - "pythainlp", # Phase 1 — Thai tokenisation for gigaspeech2_thai - "evaluate", # Phase 1 — MMAU / METEOR post-processing + "jiwer", # WER sanity checks + "sacrebleu", # BLEU verification (covost2) + "pythainlp", # Thai tokenisation for gigaspeech2_thai + "evaluate", # MMAU / METEOR post-processing "soundfile", "librosa", ] diff --git a/tests/test_audiobench.py b/tests/test_audiobench.py new file mode 100644 index 00000000..e3a945ee --- /dev/null +++ b/tests/test_audiobench.py @@ -0,0 +1,725 @@ +"""Tests for the AudioBench contrib benchmark integration.""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +from oellm.task_groups import ( + _collect_dataset_specs, + _expand_task_groups, + get_all_task_group_names, +) + +SUITE = "audiobench" +TOP_GROUP = "audio-audiobench" +ASR_GROUP = "audio-audiobench-asr" +ST_GROUP = "audio-audiobench-st" +REASONING_GROUP = "audio-audiobench-reasoning" + +# Canonical task names that MUST be in the registry. Assertions that +# reference individual task names go here so the audit table in the plan is +# reflected 1:1 in tests; a silent rename breaks the build. +BUCKET_B_TASKS = { + # ASR (9) + "audiobench_aishell_asr_zh_test", + "audiobench_earnings21_test", + "audiobench_earnings22_test", + "audiobench_tedlium3_long_form_test", + "audiobench_gigaspeech2_thai", + "audiobench_gigaspeech2_indo", + "audiobench_gigaspeech2_viet", + "audiobench_seame_dev_man", + "audiobench_seame_dev_sge", + # ST (5) + "audiobench_covost2_en_id_test", + "audiobench_covost2_en_ta_test", + "audiobench_covost2_id_en_test", + "audiobench_covost2_zh_en_test", + "audiobench_covost2_ta_en_test", + # Reasoning (6) + "audiobench_spoken_mqa_short_digit", + "audiobench_spoken_mqa_long_digit", + "audiobench_spoken_mqa_single_step_reasoning", + "audiobench_spoken_mqa_multi_step_reasoning", + "audiobench_mmau_mini", + "audiobench_audiocaps_test", +} + +BUCKET_A_DUAL = { + "audiobench_librispeech_test_clean", + "audiobench_librispeech_test_other", + "audiobench_common_voice_15_en_test", + "audiobench_gigaspeech_test", + "audiobench_peoples_speech_test", + "audiobench_tedlium3_test", + "audiobench_covost2_en_zh_test", +} + +ALL_PHASE1_TASKS = BUCKET_B_TASKS | BUCKET_A_DUAL + + +# --------------------------------------------------------------------------- +# Registry — task.py +# --------------------------------------------------------------------------- + + +class TestTaskRegistry: + def test_registry_has_exactly_27_tasks(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + assert len(AUDIOBENCH_TASKS) == 27 + + def test_registry_covers_all_phase1_task_names(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + names = {t.name for t in AUDIOBENCH_TASKS} + assert names == ALL_PHASE1_TASKS + + def test_every_task_has_audiobench_prefix(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + for t in AUDIOBENCH_TASKS: + assert t.name.startswith("audiobench_"), t.name + + def test_every_task_has_audiollms_or_amao_hf_repo(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + # Tasks live on AudioLLMs/* with the single exception of spoken-mqa + # (amao0o0/spoken-mqa). 
+ for t in AUDIOBENCH_TASKS: + assert t.hf_repo.startswith(("AudioLLMs/", "amao0o0/")), ( + f"{t.name} has unexpected repo {t.hf_repo}" + ) + + def test_asr_tasks_all_use_wer(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + for t in AUDIOBENCH_TASKS: + if t.family == "asr": + assert t.metric == "wer", f"{t.name}: {t.metric}" + + def test_st_tasks_all_use_bleu(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + for t in AUDIOBENCH_TASKS: + if t.family == "st": + assert t.metric == "bleu", f"{t.name}: {t.metric}" + + def test_gigaspeech2_tasks_share_repo_and_differ_by_data_dir(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + gs2 = [t for t in AUDIOBENCH_TASKS if "gigaspeech2" in t.name] + assert len(gs2) == 3 + # All share the same HF repo. + assert {t.hf_repo for t in gs2} == {"AudioLLMs/gigaspeech2-test"} + # Each has a distinct data_dir. + assert {t.data_dir for t in gs2} == {"th-test", "id-test", "vi-test"} + + def test_spoken_mqa_tasks_share_repo_and_differ_by_data_dir(self): + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + smqa = [t for t in AUDIOBENCH_TASKS if "spoken_mqa" in t.name] + assert len(smqa) == 4 + assert {t.hf_repo for t in smqa} == {"amao0o0/spoken-mqa"} + assert {t.data_dir for t in smqa} == { + "short_digit", + "long_digit", + "single_step_reasoning", + "multi_step_reasoning", + } + + def test_get_task_spec_returns_spec(self): + from oellm.contrib.audiobench.task import get_task_spec + + spec = get_task_spec("audiobench_librispeech_test_clean") + assert spec.upstream_name == "librispeech_test_clean" + assert spec.metric == "wer" + assert spec.family == "asr" + + def test_get_task_spec_unknown_raises(self): + from oellm.contrib.audiobench.task import get_task_spec + + with pytest.raises(KeyError, match="Unknown AudioBench task"): + get_task_spec("audiobench_does_not_exist") + + +# --------------------------------------------------------------------------- +# Adapter — adapter.py +# --------------------------------------------------------------------------- + + +class TestAudioBenchModelAdapter: + @pytest.fixture + def adapter_cls(self): + from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter + from oellm.core.base_model_adapter import BaseModelAdapter + + return AudioBenchModelAdapter, BaseModelAdapter + + def test_is_base_model_adapter(self, adapter_cls): + cls, base = adapter_cls + assert issubclass(cls, base) + + def test_qwen2_audio(self, adapter_cls): + cls, _ = adapter_cls + assert cls("Qwen/Qwen2-Audio-7B-Instruct").to_contrib_flags() == "qwen2_audio" + + def test_qwen_audio(self, adapter_cls): + cls, _ = adapter_cls + assert cls("Qwen/Qwen-Audio-Chat").to_contrib_flags() == "qwen2_audio" + + def test_salmonn(self, adapter_cls): + cls, _ = adapter_cls + assert cls("tsinghua/SALMONN-13B").to_contrib_flags() == "salmonn" + + def test_ltu(self, adapter_cls): + cls, _ = adapter_cls + assert cls("MIT/ltu-as").to_contrib_flags() == "ltu" + + def test_whisper(self, adapter_cls): + cls, _ = adapter_cls + assert cls("openai/whisper-large-v3").to_contrib_flags() == "whisper" + + def test_audio_flamingo(self, adapter_cls): + cls, _ = adapter_cls + assert cls("nvidia/audio-flamingo-2").to_contrib_flags() == "audioflamingo" + + def test_meralion(self, adapter_cls): + cls, _ = adapter_cls + assert cls("Singapore-NLP/MERaLiON-7B").to_contrib_flags() == "meralion" + + def test_unknown_defaults_to_generic(self, adapter_cls): + cls, _ = adapter_cls + assert 
cls("random/unknown-model").to_contrib_flags() == "generic" + + def test_module_level_detect_function(self): + from oellm.contrib.audiobench.adapter import detect_audiobench_model_type + + assert detect_audiobench_model_type("Qwen/Qwen2-Audio-7B") == "qwen2_audio" + assert detect_audiobench_model_type("completely/unknown") == "generic" + + +# --------------------------------------------------------------------------- +# Suite plugin protocol — suite.py +# --------------------------------------------------------------------------- + + +class TestSuiteProtocol: + @pytest.fixture + def suite(self): + import oellm.contrib.audiobench.suite as s + + return s + + def test_suite_name(self, suite): + assert suite.SUITE_NAME == "audiobench" + + def test_cluster_env_vars_declared(self, suite): + assert "AUDIOBENCH_DIR" in suite.CLUSTER_ENV_VARS + + def test_task_groups_contains_all_four_groups(self, suite): + groups = suite.TASK_GROUPS["task_groups"] + for g in (TOP_GROUP, ASR_GROUP, ST_GROUP, REASONING_GROUP): + assert g in groups, f"{g} missing from TASK_GROUPS" + + def test_top_level_group_has_all_27_tasks(self, suite): + tasks = suite.TASK_GROUPS["task_groups"][TOP_GROUP]["tasks"] + assert len(tasks) == 27 + + def test_task_metrics_present_for_all_leaves(self, suite): + metrics = suite.TASK_GROUPS["task_metrics"] + assert set(metrics.keys()) == ALL_PHASE1_TASKS + + def test_all_groups_are_zero_shot(self, suite): + for name in (TOP_GROUP, ASR_GROUP, ST_GROUP, REASONING_GROUP): + group = suite.TASK_GROUPS["task_groups"][name] + assert group["n_shots"] == [0] + assert group["suite"] == SUITE + + def test_detect_model_flags_qwen2_audio(self, suite): + assert suite.detect_model_flags("Qwen/Qwen2-Audio-7B-Instruct") == "qwen2_audio" + + def test_detect_model_flags_unknown_defaults_to_generic(self, suite): + assert suite.detect_model_flags("some/unknown-model") == "generic" + + def test_parse_results_recognises_audiobench_json(self, suite): + data = { + "model_name_or_path": "/path/to/model", + "results": { + "audiobench_librispeech_test_clean": {"wer": 0.047}, + }, + "configs": {"audiobench_librispeech_test_clean": {"num_fewshot": 0}}, + } + result = suite.parse_results(data) + assert result is not None + model_id, task_name, n_shot, metrics = result + assert model_id == "/path/to/model" + assert task_name == "audiobench_librispeech_test_clean" + assert n_shot == 0 + assert metrics["wer"] == pytest.approx(0.047) + + def test_parse_results_rejects_non_audiobench_json(self, suite): + # lmms-eval style — no audiobench_ prefix. + data = { + "model_name_or_path": "some/model", + "results": {"librispeech_test_clean": {"wer,none": 0.05}}, + "configs": {"librispeech_test_clean": {"num_fewshot": 0}}, + } + assert suite.parse_results(data) is None + + def test_parse_results_empty_returns_none(self, suite): + assert suite.parse_results({}) is None + + def test_parse_results_malformed_returns_none(self, suite): + assert suite.parse_results({"results": "not a dict"}) is None + + +# --------------------------------------------------------------------------- +# TASK_GROUPS integration with core registry. 
+# --------------------------------------------------------------------------- + + +class TestTaskGroupsIntegration: + def test_groups_registered_via_registry(self): + all_names = get_all_task_group_names() + for g in (TOP_GROUP, ASR_GROUP, ST_GROUP, REASONING_GROUP): + assert g in all_names + + def test_top_group_expands_to_27_zero_shot_tasks(self): + results = _expand_task_groups([TOP_GROUP]) + assert len(results) == 27 + for r in results: + assert r.n_shot == 0 + assert r.suite == SUITE + + def test_top_group_expands_to_expected_task_names(self): + results = _expand_task_groups([TOP_GROUP]) + assert {r.task for r in results} == ALL_PHASE1_TASKS + + def test_asr_group_has_15_leaves(self): + results = _expand_task_groups([ASR_GROUP]) + # 9 bucket-B ASR + 6 bucket-A dual ASR = 15. + assert len(results) == 15 + for r in results: + assert r.suite == SUITE + + def test_st_group_has_6_leaves(self): + results = _expand_task_groups([ST_GROUP]) + # 5 bucket-B ST + 1 bucket-A dual (en→zh) = 6. + assert len(results) == 6 + + def test_reasoning_group_has_6_leaves(self): + results = _expand_task_groups([REASONING_GROUP]) + # 4 spoken-mqa + mmau_mini + audiocaps = 6. + assert len(results) == 6 + + def test_dataset_specs_flag_snapshot_download(self): + # Auto-derived from the ``audio-*`` group-name prefix in + # _collect_dataset_specs. + specs = _collect_dataset_specs([TOP_GROUP]) + assert specs, "No dataset specs returned" + for s in specs: + assert s.needs_snapshot_download, ( + f"DatasetSpec for {s.repo_id} missing needs_snapshot_download=True" + ) + + def test_dataset_specs_dedupe_shared_repos(self): + # gigaspeech2 (3 tasks) → 1 spec; spoken-mqa (4 tasks) → 1 spec. + specs = _collect_dataset_specs([TOP_GROUP]) + repo_ids = [s.repo_id for s in specs] + assert repo_ids.count("AudioLLMs/gigaspeech2-test") == 1 + assert repo_ids.count("amao0o0/spoken-mqa") == 1 + + def test_dataset_specs_contain_audiollms_repos(self): + specs = _collect_dataset_specs([TOP_GROUP]) + repo_ids = {s.repo_id for s in specs} + # Sanity-check a handful of expected entries. + assert "AudioLLMs/librispeech_test_clean" in repo_ids + assert "AudioLLMs/earnings21_test" in repo_ids + assert "AudioLLMs/MMAU-mini" in repo_ids + assert "amao0o0/spoken-mqa" in repo_ids + + +# --------------------------------------------------------------------------- +# Registry auto-discovery. +# --------------------------------------------------------------------------- + + +class TestRegistryDiscovery: + def test_audiobench_suite_is_auto_discovered(self): + # Clear the _discover() cache so this test doesn't rely on import + # order from earlier tests. + from oellm import registry + + registry._discover.cache_clear() + mod = registry.get_suite("audiobench") + assert mod.SUITE_NAME == "audiobench" + assert hasattr(mod, "run") + assert hasattr(mod, "parse_results") + assert hasattr(mod, "detect_model_flags") + + def test_task_groups_merged_into_registry(self): + from oellm import registry + + registry._discover.cache_clear() + merged = registry.get_all_task_groups() + assert TOP_GROUP in merged["task_groups"] + # task_metrics come through too. + assert "audiobench_librispeech_test_clean" in merged["task_metrics"] + + +# --------------------------------------------------------------------------- +# EvalRunner — resolve_suite wires audiobench through the adapter. 
+# --------------------------------------------------------------------------- + + +class TestRunnerIntegration: + def test_resolve_suite_appends_model_flag(self): + from oellm.constants import EvaluationJob + from oellm.runner import EvalRunner + + runner = EvalRunner() + job = EvaluationJob( + model_path="Qwen/Qwen2-Audio-7B-Instruct", + task_path="audiobench_librispeech_test_clean", + n_shot=0, + eval_suite="audiobench", + ) + result = runner.resolve_suite(job) + assert result == "audiobench:qwen2_audio" + + def test_resolve_suite_generic_fallback(self): + from oellm.constants import EvaluationJob + from oellm.runner import EvalRunner + + runner = EvalRunner() + job = EvaluationJob( + model_path="some/unknown-model", + task_path="audiobench_mmau_mini", + n_shot=0, + eval_suite="audiobench", + ) + result = runner.resolve_suite(job) + assert result == "audiobench:generic" + + +# --------------------------------------------------------------------------- +# run() subprocess harness — exercise with a mocked subprocess. +# --------------------------------------------------------------------------- + + +class TestRunHarness: + """Exercise suite.run() with a mocked subprocess, verifying the CLI + it would invoke and the output JSON it writes. + """ + + def _fake_audiobench_tree(self, tmp_path: Path) -> Path: + """Create a minimal directory tree that looks like an AudioBench clone.""" + ab_dir = tmp_path / "AudioBench" + (ab_dir / "src").mkdir(parents=True) + (ab_dir / "src" / "main_evaluate.py").write_text("# placeholder\n") + return ab_dir + + def test_run_missing_audiobench_dir_raises(self, tmp_path): + from oellm.contrib.audiobench.suite import run + + with pytest.raises(RuntimeError, match="AUDIOBENCH_DIR must be set"): + run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=tmp_path / "out.json", + model_flags="qwen2_audio", + env={}, # no AUDIOBENCH_DIR + ) + + def test_run_missing_entrypoint_raises(self, tmp_path): + from oellm.contrib.audiobench.suite import run + + bad_dir = tmp_path / "not-audiobench" + bad_dir.mkdir() + with pytest.raises(FileNotFoundError, match="AudioBench entry point"): + run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=tmp_path / "out.json", + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(bad_dir)}, + ) + + def test_run_invokes_subprocess_with_expected_cli(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + # Write a fake AudioBench result JSON into the run_dir that + # _extract_metrics will pick up. 
+ run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "task_result.json").write_text(json.dumps({"wer": 0.063})) + return _FakeCompletedProcess(0) + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + ) as mock_sp: + suite.run( + model_path="Qwen/Qwen2-Audio-7B-Instruct", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir), "LIMIT": "100"}, + ) + + assert mock_sp.call_count == 1 + cmd = mock_sp.call_args.args[0] + assert cmd[:2] == ["python", "src/main_evaluate.py"] + assert "--dataset" in cmd + assert cmd[cmd.index("--dataset") + 1] == "librispeech_test_clean" + assert cmd[cmd.index("--model") + 1] == "qwen2_audio" + assert cmd[cmd.index("--model_name") + 1] == "Qwen/Qwen2-Audio-7B-Instruct" + assert cmd[cmd.index("--metrics") + 1] == "wer" + # LIMIT propagated. + assert cmd[cmd.index("--number_of_samples") + 1] == "100" + # cwd is AUDIOBENCH_DIR. + assert mock_sp.call_args.kwargs["cwd"] == str(ab_dir) + + # Output JSON is lmms-eval-shaped and contains the extracted metric. + body = json.loads(output_path.read_text()) + assert body["model_name_or_path"] == "Qwen/Qwen2-Audio-7B-Instruct" + assert body["results"]["audiobench_librispeech_test_clean"][ + "wer" + ] == pytest.approx(0.063) + assert body["configs"]["audiobench_librispeech_test_clean"]["num_fewshot"] == 0 + + def test_run_forwards_data_dir_for_gigaspeech2(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "gs2.json").write_text(json.dumps({"wer": 0.12})) + return _FakeCompletedProcess(0) + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + ) as mock_sp: + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_gigaspeech2_thai", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + cmd = mock_sp.call_args.args[0] + assert "--data_dir" in cmd + assert cmd[cmd.index("--data_dir") + 1] == "th-test" + + def test_run_omits_number_of_samples_when_limit_empty(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "r.json").write_text(json.dumps({"wer": 0.1})) + return _FakeCompletedProcess(0) + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + ) as mock_sp: + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir), "LIMIT": ""}, + ) + + cmd = mock_sp.call_args.args[0] + assert "--number_of_samples" not in cmd + + def test_run_nonzero_exit_raises(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + return_value=_FakeCompletedProcess(1), + ): + with pytest.raises(RuntimeError, match="AudioBench exited with code 1"): + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + 
model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + def test_run_handles_nested_metric_json(self, tmp_path): + """AudioBench output format has drifted; support + ``{"metrics": {"wer": {"score": 0.05}}}`` as well as flat + ``{"wer": 0.05}``. + """ + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "nested.json").write_text( + json.dumps({"metrics": {"wer": {"score": 0.051, "notes": "nested"}}}) + ) + return _FakeCompletedProcess(0) + + with patch("oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run): + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + body = json.loads(output_path.read_text()) + assert body["results"]["audiobench_librispeech_test_clean"][ + "wer" + ] == pytest.approx(0.051) + + def test_run_missing_metric_in_output_raises(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + def fake_run(cmd, cwd, env, check): + run_dir = Path(cmd[cmd.index("--log_dir") + 1]) + (run_dir / "no_metric.json").write_text(json.dumps({"irrelevant": 1})) + return _FakeCompletedProcess(0) + + with patch("oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run): + with pytest.raises(RuntimeError, match="Could not locate metric"): + suite.run( + model_path="Qwen/Qwen2-Audio-7B", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="qwen2_audio", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + +class _FakeCompletedProcess: + """Stand-in for subprocess.CompletedProcess.""" + + def __init__(self, returncode: int) -> None: + self.returncode = returncode + + +# --------------------------------------------------------------------------- +# schedule_evals dry-run — wiring smoke test. +# --------------------------------------------------------------------------- + + +class TestScheduleEvalsDryRun: + def test_dry_run_writes_audiobench_suite_to_csv(self, tmp_path): + import pandas as pd + + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="Qwen/Qwen2-Audio-7B-Instruct", + task_groups=ASR_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + csv_files = list(tmp_path.glob("**/jobs.csv")) + assert len(csv_files) == 1 + df = pd.read_csv(csv_files[0]) + # All rows route to audiobench (with or without model-flag suffix). + assert all(s.startswith("audiobench") for s in df["eval_suite"].unique()) + # task_path column contains canonical audiobench_ names. 
+ assert all(t.startswith("audiobench_") for t in df["task_path"].unique()) + + def test_dry_run_sbatch_contains_contrib_dispatch(self, tmp_path): + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="Qwen/Qwen2-Audio-7B-Instruct", + task_groups=TOP_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + sbatch_files = list(tmp_path.glob("**/submit_evals.sbatch")) + assert len(sbatch_files) == 1 + content = sbatch_files[0].read_text() + assert "oellm.contrib.dispatch" in content + # LIMIT is now exported so contrib plugins can read it. + assert "export LIMIT=" in content + + +# --------------------------------------------------------------------------- +# collect_results compatibility — verify a run() output flows through unchanged. +# --------------------------------------------------------------------------- + + +class TestCollectResultsCompat: + def test_collect_results_parses_audiobench_json(self, tmp_path): + import pandas as pd + + from oellm.main import collect_results + + results_dir = tmp_path / "results" + results_dir.mkdir() + + mock_output = { + "model_name_or_path": "/cluster/models/Qwen2-Audio-7B", + "results": { + "audiobench_librispeech_test_clean": {"wer": 0.052}, + }, + "configs": {"audiobench_librispeech_test_clean": {"num_fewshot": 0}}, + } + (results_dir / "ab123.json").write_text(json.dumps(mock_output)) + + output_csv = str(tmp_path / "results.csv") + collect_results(str(tmp_path), output_csv=output_csv) + + df = pd.read_csv(output_csv) + assert len(df) == 1 + row = df.iloc[0] + assert row["task"] == "audiobench_librispeech_test_clean" + assert float(row["performance"]) == pytest.approx(0.052) + assert row["model_name"] == "/cluster/models/Qwen2-Audio-7B" From 68343b1d1610cad76ba0d26e3d888b511f69f25d Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 4 May 2026 10:55:43 +0200 Subject: [PATCH 3/6] fixes --- oellm/contrib/audiobench/adapter.py | 49 +-- oellm/contrib/audiobench/suite.py | 113 ++++--- oellm/contrib/audiobench/task.py | 40 +-- oellm/scheduler.py | 11 +- pyproject.toml | 9 +- tests/test_audiobench.py | 451 +++++++++++++++++++++------- 6 files changed, 469 insertions(+), 204 deletions(-) diff --git a/oellm/contrib/audiobench/adapter.py b/oellm/contrib/audiobench/adapter.py index 4734ba69..66a9655a 100644 --- a/oellm/contrib/audiobench/adapter.py +++ b/oellm/contrib/audiobench/adapter.py @@ -1,29 +1,38 @@ """AudioBench model adapter. -Maps a HuggingFace model path to the string key that AudioBench's -``src/main_evaluate.py --model`` argument expects. The detected value is -passed to :mod:`oellm.contrib.dispatch` as the ``model_flags`` portion of -the ``eval_suite`` column (``audiobench:``). +Maps a HuggingFace model path to AudioBench's literal ``--model_name`` value. + +AudioBench's ``Model`` class (in ``$AUDIOBENCH_DIR/src/model.py``) dispatches +on **exact-string** match against a fixed list — there is no family-level +indirection and no fallback. Each supported model has a hardcoded loader +under ``model_src/`` that loads its own HF repo internally; AudioBench +**cannot evaluate arbitrary HF checkpoints**, only the variants it knows +about. If we can't map the user's ``model_path`` to one of those literals, +we return ``None`` and ``suite.run`` raises a clear error. 
""" from __future__ import annotations from oellm.core.base_model_adapter import BaseModelAdapter -# (model-family key, substrings to match in lowered model path). Order -# matters — first match wins, so more-specific patterns come first. +# (audiobench_model_name, substrings_to_match_in_lower(model_path)). +# Order matters — first match wins; put more-specific patterns first. +# Keys MUST be the exact literals AudioBench's model.py dispatch expects. _PATTERNS: list[tuple[str, tuple[str, ...]]] = [ - ("qwen2_audio", ("qwen2-audio", "qwen2_audio", "qwen-audio", "qwen_audio")), - ("salmonn", ("salmonn",)), - ("ltu", ("ltu-", "/ltu", "_ltu", "ltu_as")), - ("whisper", ("whisper-", "/whisper", "openai/whisper")), - ("audioflamingo", ("audio-flamingo", "audioflamingo", "audio_flamingo")), - ("meralion", ("meralion",)), + ("Qwen2-Audio-7B-Instruct", ("qwen2-audio-7b-instruct", "qwen2_audio_7b_instruct")), + ("Qwen-Audio-Chat", ("qwen-audio-chat", "qwen_audio_chat")), + ("SALMONN_7B", ("salmonn",)), + ("MERaLiON-AudioLLM-Whisper-SEA-LION", ("meralion-audiollm", "meralion_audiollm")), + ("whisper_large_v3", ("whisper-large-v3", "whisper_large_v3")), + ("whisper_large_v2", ("whisper-large-v2", "whisper_large_v2")), + ("phi_4_multimodal_instruct", ("phi-4-multimodal", "phi_4_multimodal")), + ("seallms_audio_7b", ("seallms-audio-7b", "seallms_audio_7b")), + ("WavLLM_fairseq", ("wavllm",)), ] class AudioBenchModelAdapter(BaseModelAdapter): - """Adapter resolving the ``--model`` flag for the AudioBench subprocess.""" + """Adapter resolving the ``--model_name`` value for the AudioBench subprocess.""" def __init__(self, model_path: str) -> None: self._path = model_path @@ -42,13 +51,19 @@ def to_lmms_eval_args(self) -> str: return f"pretrained={self._path}" def to_contrib_flags(self) -> str | None: + """Return AudioBench's ``model_name`` dispatch key, or ``None`` if no match. + + Returning ``None`` is intentional: AudioBench has no generic loader, + so an unmatched model path must fail loudly rather than fall through + to a fictitious ``generic`` key that AudioBench doesn't recognize. + """ lowered = self._path.lower() for key, needles in _PATTERNS: if any(n in lowered for n in needles): return key - return "generic" + return None -def detect_audiobench_model_type(model_path: str) -> str: - """Like ``to_contrib_flags`` but always returns a string (default ``generic``).""" - return AudioBenchModelAdapter(model_path).to_contrib_flags() or "generic" +def detect_audiobench_model_type(model_path: str) -> str | None: + """Convenience wrapper around :meth:`AudioBenchModelAdapter.to_contrib_flags`.""" + return AudioBenchModelAdapter(model_path).to_contrib_flags() diff --git a/oellm/contrib/audiobench/suite.py b/oellm/contrib/audiobench/suite.py index e601e784..7ec28e46 100644 --- a/oellm/contrib/audiobench/suite.py +++ b/oellm/contrib/audiobench/suite.py @@ -57,11 +57,10 @@ def _build_task_groups() -> dict: task_metrics: dict[str, str] = {t.name: t.metric for t in AUDIOBENCH_TASKS} def _task_entry(t: AudioBenchTaskSpec) -> dict: - # We deliberately omit ``subset`` — load_dataset treats it as a - # config name, but for gigaspeech2 / spoken-mqa the upstream - # distinction is a ``data_dir``. The ``audio-*`` prefix triggers - # full-repo snapshot_download, so AudioBench can read the right - # data_dir at runtime. + # No ``subset`` — for gigaspeech2 / spoken-mqa the upstream split + # selection is encoded in ``upstream_name`` itself (e.g. + # ``gigaspeech2_thai``). 
The ``audio-*`` group prefix triggers + # full-repo snapshot_download in :func:`_collect_dataset_specs`. return {"task": t.name, "dataset": t.hf_repo} groups: dict[str, dict] = {} @@ -99,7 +98,13 @@ def _task_entry(t: AudioBenchTaskSpec) -> dict: def detect_model_flags(model_path: str) -> str | None: - """Return the AudioBench ``--model`` family key for *model_path*.""" + """Return AudioBench's literal ``--model_name`` dispatch key for *model_path*. + + Returns ``None`` when *model_path* does not match any AudioBench-supported + model family — :func:`run` then raises a clear error. AudioBench has no + generic loader, so silently falling back to a fictitious key would just + move the error deeper inside the subprocess. + """ from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter return AudioBenchModelAdapter(model_path).to_contrib_flags() @@ -136,27 +141,31 @@ def run( ) spec = get_task_spec(task) - model_key = model_flags or "generic" - - run_dir = output_path.parent / f"audiobench_{output_path.stem}" - run_dir.mkdir(parents=True, exist_ok=True) + if not model_flags: + raise RuntimeError( + f"Could not map model_path={model_path!r} to an AudioBench-supported " + f"model. AudioBench dispatches on a fixed list of literal " + f"model_name strings (Qwen2-Audio-7B-Instruct, SALMONN_7B, " + f"whisper_large_v3, …) — see oellm/contrib/audiobench/adapter.py. " + f"AudioBench cannot evaluate arbitrary HF checkpoints; it loads " + f"its own hardcoded HF repos per model family." + ) + model_key = model_flags # AudioBench's dispatch key, e.g. "Qwen2-Audio-7B-Instruct" cmd = [ "python", "src/main_evaluate.py", - "--dataset", + "--dataset_name", spec.upstream_name, - "--model", - model_key, "--model_name", - model_path, + model_key, "--metrics", spec.upstream_metric, - "--log_dir", - str(run_dir), + # Force re-eval — AudioBench skips by default if a stale score file + # already exists under log_for_all_models/. + "--overwrite", + "True", ] - if spec.data_dir: - cmd.extend(["--data_dir", spec.data_dir]) limit = env.get("LIMIT", "").strip() if limit: @@ -172,10 +181,12 @@ def run( if completed.returncode != 0: raise RuntimeError( f"AudioBench exited with code {completed.returncode} for " - f"task={task!r} model={model_path!r}" + f"task={task!r} model={model_path!r} (dispatch key={model_key!r})" ) - metrics = _extract_metrics(run_dir, spec) + metrics = _extract_metrics( + audiobench_dir=Path(ab_dir), model_key=model_key, spec=spec + ) _write_lmms_shaped_json( output_path=output_path, model_path=model_path, @@ -186,32 +197,48 @@ def run( logger.info("Results written to %s", output_path) -def _extract_metrics(run_dir: Path, spec: AudioBenchTaskSpec) -> dict[str, float]: - """Find AudioBench's per-task result JSON under *run_dir* and read it.""" - candidates = sorted(run_dir.rglob("*.json")) - if not candidates: +def _extract_metrics( + *, + audiobench_dir: Path, + model_key: str, + spec: AudioBenchTaskSpec, +) -> dict[str, float]: + """Read AudioBench's score file from its hardcoded output path. + + AudioBench writes to ``$cwd/log_for_all_models//__score.json`` + (see ``main_evaluate.py:118``). Path is fixed — there is no ``--log_dir``. + """ + score_file = ( + audiobench_dir + / "log_for_all_models" + / model_key + / f"{spec.upstream_name}_{spec.upstream_metric}_score.json" + ) + if not score_file.exists(): raise RuntimeError( - f"AudioBench produced no result JSON under {run_dir}. " - "Check stdout/stderr for crashes." + f"AudioBench did not write expected score file at {score_file}. 
" + f"Either AudioBench crashed silently, or the dispatch key " + f"{model_key!r} / dataset_name {spec.upstream_name!r} / metric " + f"{spec.upstream_metric!r} is wrong. Check stdout/stderr." ) - target_key = spec.upstream_metric - for path in candidates: - try: - with open(path) as f: - body = json.load(f) - except (json.JSONDecodeError, OSError): - continue - value = _find_metric(body, target_key) - if value is not None: - # Emit under our canonical key so collect_results' metric - # resolution picks up task_metrics.yaml. - return {spec.metric: float(value)} - - raise RuntimeError( - f"Could not locate metric {target_key!r} in any of " - f"{len(candidates)} AudioBench result JSON(s) under {run_dir}" - ) + try: + with open(score_file) as f: + body = json.load(f) + except (json.JSONDecodeError, OSError) as e: + raise RuntimeError( + f"Could not read AudioBench score file {score_file}: {e}" + ) from e + + value = _find_metric(body, spec.upstream_metric) + if value is None: + raise RuntimeError( + f"Could not locate metric {spec.upstream_metric!r} in AudioBench " + f"score file {score_file}. Body: {body!r}" + ) + # Emit under our canonical key so collect_results' metric resolution + # picks up task_metrics.yaml. + return {spec.metric: float(value)} def _find_metric(body: object, key: str) -> float | None: diff --git a/oellm/contrib/audiobench/task.py b/oellm/contrib/audiobench/task.py index 849477f1..32098880 100644 --- a/oellm/contrib/audiobench/task.py +++ b/oellm/contrib/audiobench/task.py @@ -21,11 +21,10 @@ class AudioBenchTaskSpec: """Metadata for a single AudioBench task. - ``upstream_name`` is what AudioBench's ``--dataset`` flag expects; - ``upstream_metric`` is what ``--metrics`` expects (usually identical to - ``metric``). ``data_dir`` is the optional upstream ``--data_dir`` - selector used when multiple tasks share one HF repo (gigaspeech2, - spoken-mqa). + ``upstream_name`` is the literal string AudioBench's ``--dataset_name`` + expects (matched exactly against ``$AUDIOBENCH_DIR/src/dataset.py``'s + dispatch table). ``upstream_metric`` is what ``--metrics`` expects + (usually identical to our canonical ``metric``). """ name: str @@ -34,7 +33,6 @@ class AudioBenchTaskSpec: metric: str upstream_metric: str family: str - data_dir: str | None = None @property def task_group(self) -> str: @@ -48,7 +46,6 @@ def _t( family: str, *, upstream_metric: str | None = None, - data_dir: str | None = None, name: str | None = None, ) -> AudioBenchTaskSpec: """Build a spec with ``name = audiobench_`` by default.""" @@ -59,7 +56,6 @@ def _t( metric=metric, upstream_metric=upstream_metric or metric, family=family, - data_dir=data_dir, ) @@ -69,29 +65,28 @@ def _t( _t("earnings21_test", "AudioLLMs/earnings21_test", "wer", "asr"), _t("earnings22_test", "AudioLLMs/earnings22_test", "wer", "asr"), _t("tedlium3_long_form_test", "AudioLLMs/tedlium3_long_form_test", "wer", "asr"), - # GigaSpeech2 — 3 languages share one HF repo, disambiguated by data_dir. + # GigaSpeech2 — 3 languages share one HF repo. AudioBench dispatches via + # the dataset_name string itself (gigaspeech2_thai/indo/viet), not via a + # --data_dir flag (which doesn't exist upstream). 
_t( - "gigaspeech2", + "gigaspeech2_thai", "AudioLLMs/gigaspeech2-test", "wer", "asr", - data_dir="th-test", name="audiobench_gigaspeech2_thai", ), _t( - "gigaspeech2", + "gigaspeech2_indo", "AudioLLMs/gigaspeech2-test", "wer", "asr", - data_dir="id-test", name="audiobench_gigaspeech2_indo", ), _t( - "gigaspeech2", + "gigaspeech2_viet", "AudioLLMs/gigaspeech2-test", "wer", "asr", - data_dir="vi-test", name="audiobench_gigaspeech2_viet", ), _t("seame_dev_man", "AudioLLMs/seame_dev_man", "wer", "asr"), @@ -107,41 +102,38 @@ def _t( ] _NEW_REASONING = [ - # Spoken-MQA — 4 splits share one HF repo; split is an upstream data_dir. + # Spoken-MQA — 4 splits share one HF repo. AudioBench dispatches via + # the hyphen-prefixed dataset_name (spoken-mqa_), not --data_dir. _t( - "spoken-mqa", + "spoken-mqa_short_digit", "amao0o0/spoken-mqa", "accuracy", "reasoning", upstream_metric="acc", - data_dir="short_digit", name="audiobench_spoken_mqa_short_digit", ), _t( - "spoken-mqa", + "spoken-mqa_long_digit", "amao0o0/spoken-mqa", "accuracy", "reasoning", upstream_metric="acc", - data_dir="long_digit", name="audiobench_spoken_mqa_long_digit", ), _t( - "spoken-mqa", + "spoken-mqa_single_step_reasoning", "amao0o0/spoken-mqa", "accuracy", "reasoning", upstream_metric="acc", - data_dir="single_step_reasoning", name="audiobench_spoken_mqa_single_step_reasoning", ), _t( - "spoken-mqa", + "spoken-mqa_multi_step_reasoning", "amao0o0/spoken-mqa", "accuracy", "reasoning", upstream_metric="acc", - data_dir="multi_step_reasoning", name="audiobench_spoken_mqa_multi_step_reasoning", ), _t("mmau_mini", "AudioLLMs/MMAU-mini", "string_match", "reasoning"), diff --git a/oellm/scheduler.py b/oellm/scheduler.py index af203c91..61a80ddd 100644 --- a/oellm/scheduler.py +++ b/oellm/scheduler.py @@ -306,7 +306,16 @@ def schedule_evals( logging.warning("No evaluation jobs to schedule.") return None - df["eval_suite"] = df["eval_suite"].str.lower() + # Lowercase the suite name only, preserve any ``:model_flags`` suffix + # verbatim — contrib dispatch keys can be case-sensitive (e.g. + # AudioBench's ``Qwen2-Audio-7B-Instruct`` is matched literally). + def _lower_suite_only(s: str) -> str: + if ":" in s: + head, tail = s.split(":", 1) + return f"{head.lower()}:{tail}" + return s.lower() + + df["eval_suite"] = df["eval_suite"].map(_lower_suite_only) # Ensure that all datasets required by the tasks are cached locally to avoid # network access on compute nodes. diff --git a/pyproject.toml b/pyproject.toml index 4128db93..8dcdb8f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,10 +38,11 @@ audio = [ # clusters.yaml points at a local git clone and suite.py subprocesses into # ``python src/main_evaluate.py``. These are our post-processing deps. 
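For reference while reviewing the hunks above, here is a sketch of the subprocess invocation `run()` now assembles and the fixed location the score is read back from. The task, model, and LIMIT values are illustrative; the flag names and the path layout mirror the code above.

```python
# Illustrative values only — mirrors the cmd list run() builds above.
cmd = [
    "python", "src/main_evaluate.py",
    "--dataset_name", "librispeech_test_clean",
    "--model_name", "Qwen2-Audio-7B-Instruct",  # AudioBench's literal dispatch key
    "--metrics", "wer",
    "--overwrite", "True",                      # don't reuse a stale score file
    "--number_of_samples", "100",               # only appended when LIMIT is set
]
# AudioBench then writes its score to a fixed path under $AUDIOBENCH_DIR:
#   log_for_all_models/Qwen2-Audio-7B-Instruct/librispeech_test_clean_wer_score.json
```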
audiobench = [ - "jiwer", # WER sanity checks - "sacrebleu", # BLEU verification (covost2) - "pythainlp", # Thai tokenisation for gigaspeech2_thai - "evaluate", # MMAU / METEOR post-processing + "jiwer<3", # AudioBench uses jiwer.compute_measures, removed in 3.0 + "transformers>=4.45,<5", # AudioBench's Qwen2-Audio loader uses the v4 processor API (`audios=` kwarg); v5 silently drops audio inputs and produces garbage predictions + "sacrebleu", # BLEU verification (covost2) + "pythainlp", # Thai tokenisation for gigaspeech2_thai + "evaluate", # MMAU / METEOR post-processing "soundfile", "librosa", ] diff --git a/tests/test_audiobench.py b/tests/test_audiobench.py index e3a945ee..d7617eb7 100644 --- a/tests/test_audiobench.py +++ b/tests/test_audiobench.py @@ -1,4 +1,24 @@ -"""Tests for the AudioBench contrib benchmark integration.""" +"""Tests for the AudioBench contrib benchmark integration. + +The shape of these tests reflects AudioBench's actual upstream API +(``$AUDIOBENCH_DIR/src/main_evaluate.py``), which we discovered while +debugging the first cluster smoke test: + +* ``main()`` accepts only ``dataset_name`` / ``model_name`` / ``metrics`` / + ``overwrite`` / ``number_of_samples`` — no ``--model``, no ``--log_dir``, + no ``--data_dir``. +* ``Model.__init__`` and ``Dataset.load_dataset`` dispatch on **exact** + string match against fixed lists; AudioBench cannot evaluate arbitrary + HF checkpoints (only the variants whose loaders ship under + ``model_src/``), and split selection happens via the dataset_name itself + (``gigaspeech2_thai``, ``spoken-mqa_short_digit``) — there is no + ``--data_dir`` flag. +* AudioBench writes scores to the hardcoded path + ``$cwd/log_for_all_models//__score.json``. +* Without ``--overwrite True`` AudioBench skips evaluation when a stale + score file exists, so :func:`oellm.contrib.audiobench.suite.run` always + passes that flag. +""" from __future__ import annotations @@ -22,10 +42,9 @@ ST_GROUP = "audio-audiobench-st" REASONING_GROUP = "audio-audiobench-reasoning" -# Canonical task names that MUST be in the registry. Assertions that -# reference individual task names go here so the audit table in the plan is -# reflected 1:1 in tests; a silent rename breaks the build. -BUCKET_B_TASKS = { +# Canonical task names that MUST be in the registry. A silent rename +# breaks the build. +NEW_TASKS = { # ASR (9) "audiobench_aishell_asr_zh_test", "audiobench_earnings21_test", @@ -51,7 +70,7 @@ "audiobench_audiocaps_test", } -BUCKET_A_DUAL = { +DUAL_TASKS = { "audiobench_librispeech_test_clean", "audiobench_librispeech_test_other", "audiobench_common_voice_15_en_test", @@ -61,7 +80,7 @@ "audiobench_covost2_en_zh_test", } -ALL_PHASE1_TASKS = BUCKET_B_TASKS | BUCKET_A_DUAL +ALL_PHASE1_TASKS = NEW_TASKS | DUAL_TASKS # --------------------------------------------------------------------------- @@ -90,8 +109,6 @@ def test_every_task_has_audiobench_prefix(self): def test_every_task_has_audiollms_or_amao_hf_repo(self): from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS - # Tasks live on AudioLLMs/* with the single exception of spoken-mqa - # (amao0o0/spoken-mqa). 
for t in AUDIOBENCH_TASKS: assert t.hf_repo.startswith(("AudioLLMs/", "amao0o0/")), ( f"{t.name} has unexpected repo {t.hf_repo}" @@ -111,29 +128,51 @@ def test_st_tasks_all_use_bleu(self): if t.family == "st": assert t.metric == "bleu", f"{t.name}: {t.metric}" - def test_gigaspeech2_tasks_share_repo_and_differ_by_data_dir(self): + def test_gigaspeech2_tasks_use_per_split_upstream_name(self): + """All 3 GigaSpeech2 tasks share one HF repo, but AudioBench's + ``--dataset_name`` dispatch keys are the split-suffixed forms + (``gigaspeech2_thai``/``_indo``/``_viet``) — there is no + ``--data_dir`` flag. + """ from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS gs2 = [t for t in AUDIOBENCH_TASKS if "gigaspeech2" in t.name] assert len(gs2) == 3 - # All share the same HF repo. assert {t.hf_repo for t in gs2} == {"AudioLLMs/gigaspeech2-test"} - # Each has a distinct data_dir. - assert {t.data_dir for t in gs2} == {"th-test", "id-test", "vi-test"} + assert {t.upstream_name for t in gs2} == { + "gigaspeech2_thai", + "gigaspeech2_indo", + "gigaspeech2_viet", + } - def test_spoken_mqa_tasks_share_repo_and_differ_by_data_dir(self): + def test_spoken_mqa_tasks_use_per_split_upstream_name(self): + """All 4 spoken-mqa tasks share one HF repo; AudioBench dispatches + via the hyphen-prefixed split-suffixed dataset_name + (``spoken-mqa_``). + """ from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS smqa = [t for t in AUDIOBENCH_TASKS if "spoken_mqa" in t.name] assert len(smqa) == 4 assert {t.hf_repo for t in smqa} == {"amao0o0/spoken-mqa"} - assert {t.data_dir for t in smqa} == { - "short_digit", - "long_digit", - "single_step_reasoning", - "multi_step_reasoning", + assert {t.upstream_name for t in smqa} == { + "spoken-mqa_short_digit", + "spoken-mqa_long_digit", + "spoken-mqa_single_step_reasoning", + "spoken-mqa_multi_step_reasoning", } + def test_spoken_mqa_uses_acc_metric_upstream(self): + """Upstream metric for spoken-mqa is ``acc`` (the canonical key + we expose externally is ``accuracy``). + """ + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS + + for t in AUDIOBENCH_TASKS: + if "spoken_mqa" in t.name: + assert t.metric == "accuracy" + assert t.upstream_metric == "acc" + def test_get_task_spec_returns_spec(self): from oellm.contrib.audiobench.task import get_task_spec @@ -148,6 +187,17 @@ def test_get_task_spec_unknown_raises(self): with pytest.raises(KeyError, match="Unknown AudioBench task"): get_task_spec("audiobench_does_not_exist") + def test_no_task_spec_carries_data_dir_attribute(self): + """``data_dir`` was removed once we discovered AudioBench has no + such flag; guard against accidental reintroduction. + """ + from oellm.contrib.audiobench.task import AUDIOBENCH_TASKS, AudioBenchTaskSpec + + # Field removed entirely from the dataclass. + assert "data_dir" not in AudioBenchTaskSpec.__dataclass_fields__ + for t in AUDIOBENCH_TASKS: + assert not hasattr(t, "data_dir") + # --------------------------------------------------------------------------- # Adapter — adapter.py @@ -155,6 +205,13 @@ def test_get_task_spec_unknown_raises(self): class TestAudioBenchModelAdapter: + """Adapter must return AudioBench's literal ``model_name`` dispatch keys. + + Each pattern check is a regression target — AudioBench's ``model.py`` + does ``if self.model_name == "":`` and raises + NotImplementedError on any other value. 
+ """ + @pytest.fixture def adapter_cls(self): from oellm.contrib.audiobench.adapter import AudioBenchModelAdapter @@ -166,43 +223,61 @@ def test_is_base_model_adapter(self, adapter_cls): cls, base = adapter_cls assert issubclass(cls, base) - def test_qwen2_audio(self, adapter_cls): + def test_qwen2_audio_7b_instruct_returns_literal_key(self, adapter_cls): cls, _ = adapter_cls - assert cls("Qwen/Qwen2-Audio-7B-Instruct").to_contrib_flags() == "qwen2_audio" + # AudioBench dispatches on the literal "Qwen2-Audio-7B-Instruct". + assert ( + cls("Qwen/Qwen2-Audio-7B-Instruct").to_contrib_flags() + == "Qwen2-Audio-7B-Instruct" + ) - def test_qwen_audio(self, adapter_cls): + def test_qwen_audio_chat_returns_literal_key(self, adapter_cls): cls, _ = adapter_cls - assert cls("Qwen/Qwen-Audio-Chat").to_contrib_flags() == "qwen2_audio" + assert cls("Qwen/Qwen-Audio-Chat").to_contrib_flags() == "Qwen-Audio-Chat" - def test_salmonn(self, adapter_cls): + def test_salmonn_returns_salmonn_7b(self, adapter_cls): cls, _ = adapter_cls - assert cls("tsinghua/SALMONN-13B").to_contrib_flags() == "salmonn" + # AudioBench only ships the 7B variant (model_src/salmonn_7b.py). + assert cls("tsinghua/SALMONN-7B").to_contrib_flags() == "SALMONN_7B" - def test_ltu(self, adapter_cls): + def test_whisper_large_v3(self, adapter_cls): cls, _ = adapter_cls - assert cls("MIT/ltu-as").to_contrib_flags() == "ltu" + assert cls("openai/whisper-large-v3").to_contrib_flags() == "whisper_large_v3" - def test_whisper(self, adapter_cls): + def test_whisper_large_v2(self, adapter_cls): cls, _ = adapter_cls - assert cls("openai/whisper-large-v3").to_contrib_flags() == "whisper" + assert cls("openai/whisper-large-v2").to_contrib_flags() == "whisper_large_v2" - def test_audio_flamingo(self, adapter_cls): + def test_meralion_returns_full_literal_key(self, adapter_cls): cls, _ = adapter_cls - assert cls("nvidia/audio-flamingo-2").to_contrib_flags() == "audioflamingo" + assert ( + cls("MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION").to_contrib_flags() + == "MERaLiON-AudioLLM-Whisper-SEA-LION" + ) - def test_meralion(self, adapter_cls): + def test_phi_4_multimodal(self, adapter_cls): cls, _ = adapter_cls - assert cls("Singapore-NLP/MERaLiON-7B").to_contrib_flags() == "meralion" + assert ( + cls("microsoft/Phi-4-multimodal-instruct").to_contrib_flags() + == "phi_4_multimodal_instruct" + ) - def test_unknown_defaults_to_generic(self, adapter_cls): + def test_unknown_returns_none(self, adapter_cls): + """AudioBench has no generic loader. Unmatched paths must return + ``None`` so :func:`suite.run` can raise a clear error rather than + falling through to a fictitious dispatch key. 
+ """ cls, _ = adapter_cls - assert cls("random/unknown-model").to_contrib_flags() == "generic" + assert cls("random/unknown-model").to_contrib_flags() is None def test_module_level_detect_function(self): from oellm.contrib.audiobench.adapter import detect_audiobench_model_type - assert detect_audiobench_model_type("Qwen/Qwen2-Audio-7B") == "qwen2_audio" - assert detect_audiobench_model_type("completely/unknown") == "generic" + assert ( + detect_audiobench_model_type("Qwen/Qwen2-Audio-7B-Instruct") + == "Qwen2-Audio-7B-Instruct" + ) + assert detect_audiobench_model_type("completely/unknown") is None # --------------------------------------------------------------------------- @@ -243,10 +318,13 @@ def test_all_groups_are_zero_shot(self, suite): assert group["suite"] == SUITE def test_detect_model_flags_qwen2_audio(self, suite): - assert suite.detect_model_flags("Qwen/Qwen2-Audio-7B-Instruct") == "qwen2_audio" + assert ( + suite.detect_model_flags("Qwen/Qwen2-Audio-7B-Instruct") + == "Qwen2-Audio-7B-Instruct" + ) - def test_detect_model_flags_unknown_defaults_to_generic(self, suite): - assert suite.detect_model_flags("some/unknown-model") == "generic" + def test_detect_model_flags_unknown_returns_none(self, suite): + assert suite.detect_model_flags("some/unknown-model") is None def test_parse_results_recognises_audiobench_json(self, suite): data = { @@ -304,14 +382,14 @@ def test_top_group_expands_to_expected_task_names(self): def test_asr_group_has_15_leaves(self): results = _expand_task_groups([ASR_GROUP]) - # 9 bucket-B ASR + 6 bucket-A dual ASR = 15. + # 9 new ASR + 6 dual ASR = 15. assert len(results) == 15 for r in results: assert r.suite == SUITE def test_st_group_has_6_leaves(self): results = _expand_task_groups([ST_GROUP]) - # 5 bucket-B ST + 1 bucket-A dual (en→zh) = 6. + # 5 new ST + 1 dual (en→zh) = 6. assert len(results) == 6 def test_reasoning_group_has_6_leaves(self): @@ -339,7 +417,6 @@ def test_dataset_specs_dedupe_shared_repos(self): def test_dataset_specs_contain_audiollms_repos(self): specs = _collect_dataset_specs([TOP_GROUP]) repo_ids = {s.repo_id for s in specs} - # Sanity-check a handful of expected entries. assert "AudioLLMs/librispeech_test_clean" in repo_ids assert "AudioLLMs/earnings21_test" in repo_ids assert "AudioLLMs/MMAU-mini" in repo_ids @@ -370,7 +447,6 @@ def test_task_groups_merged_into_registry(self): registry._discover.cache_clear() merged = registry.get_all_task_groups() assert TOP_GROUP in merged["task_groups"] - # task_metrics come through too. assert "audiobench_librispeech_test_clean" in merged["task_metrics"] @@ -380,7 +456,7 @@ def test_task_groups_merged_into_registry(self): class TestRunnerIntegration: - def test_resolve_suite_appends_model_flag(self): + def test_resolve_suite_appends_audiobench_dispatch_key(self): from oellm.constants import EvaluationJob from oellm.runner import EvalRunner @@ -392,9 +468,17 @@ def test_resolve_suite_appends_model_flag(self): eval_suite="audiobench", ) result = runner.resolve_suite(job) - assert result == "audiobench:qwen2_audio" - - def test_resolve_suite_generic_fallback(self): + # AudioBench's literal dispatch key (case-sensitive) must come + # through verbatim so dispatch.py / suite.run get the exact value + # AudioBench's ``Model`` class compares against. 
+ assert result == "audiobench:Qwen2-Audio-7B-Instruct" + + def test_resolve_suite_unknown_model_passes_through_bare(self): + """When the adapter returns ``None`` (no AudioBench-supported + loader for the model path), ``resolve_suite`` keeps the bare + suite name; :func:`suite.run` then raises a clear error at + dispatch time rather than fabricating a fake key. + """ from oellm.constants import EvaluationJob from oellm.runner import EvalRunner @@ -406,7 +490,7 @@ def test_resolve_suite_generic_fallback(self): eval_suite="audiobench", ) result = runner.resolve_suite(job) - assert result == "audiobench:generic" + assert result == "audiobench" # bare, no ``:flags`` suffix # --------------------------------------------------------------------------- @@ -415,8 +499,10 @@ def test_resolve_suite_generic_fallback(self): class TestRunHarness: - """Exercise suite.run() with a mocked subprocess, verifying the CLI - it would invoke and the output JSON it writes. + """Exercise suite.run() with a mocked subprocess, verifying both the + CLI it would invoke (matching AudioBench's actual ``main()`` signature) + and that we read the score file from AudioBench's hardcoded output + location. """ def _fake_audiobench_tree(self, tmp_path: Path) -> Path: @@ -426,16 +512,54 @@ def _fake_audiobench_tree(self, tmp_path: Path) -> Path: (ab_dir / "src" / "main_evaluate.py").write_text("# placeholder\n") return ab_dir + @staticmethod + def _score_file_path( + ab_dir: Path, model_name: str, dataset: str, metric: str + ) -> Path: + """Mirror suite._extract_metrics' path construction.""" + return ( + ab_dir / "log_for_all_models" / model_name / f"{dataset}_{metric}_score.json" + ) + + def _fake_run_writing_score( + self, ab_dir: Path, *, score_value: float, body_shape: str = "flat" + ): + """Build a fake_run that writes a score file at AudioBench's + hardcoded path, parameterized by the JSON shape we want to test. 
+ """ + + def fake_run(cmd, cwd, env, check): + model_name = cmd[cmd.index("--model_name") + 1] + dataset = cmd[cmd.index("--dataset_name") + 1] + metric = cmd[cmd.index("--metrics") + 1] + score_file = self._score_file_path(Path(cwd), model_name, dataset, metric) + score_file.parent.mkdir(parents=True, exist_ok=True) + if body_shape == "flat": + score_file.write_text(json.dumps({metric: score_value})) + elif body_shape == "nested": + score_file.write_text( + json.dumps({"metrics": {metric: {"score": score_value, "n": 100}}}) + ) + elif body_shape == "missing_metric": + score_file.write_text(json.dumps({"irrelevant": 1})) + elif body_shape == "no_file": + pass # deliberately don't write + else: + raise ValueError(f"unknown body_shape: {body_shape}") + return _FakeCompletedProcess(0) + + return fake_run + def test_run_missing_audiobench_dir_raises(self, tmp_path): from oellm.contrib.audiobench.suite import run with pytest.raises(RuntimeError, match="AUDIOBENCH_DIR must be set"): run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=tmp_path / "out.json", - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={}, # no AUDIOBENCH_DIR ) @@ -446,53 +570,74 @@ def test_run_missing_entrypoint_raises(self, tmp_path): bad_dir.mkdir() with pytest.raises(FileNotFoundError, match="AudioBench entry point"): run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=tmp_path / "out.json", - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(bad_dir)}, ) + def test_run_unmapped_model_raises(self, tmp_path): + """AudioBench has no generic loader. When ``model_flags`` is + ``None`` (adapter found no match), :func:`run` must fail loudly + rather than invoking AudioBench with a missing/empty model_name. + """ + from oellm.contrib.audiobench.suite import run + + ab_dir = self._fake_audiobench_tree(tmp_path) + with pytest.raises(RuntimeError, match="Could not map model_path"): + run( + model_path="random/unknown-model", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=tmp_path / "out.json", + model_flags=None, + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + def test_run_invokes_subprocess_with_expected_cli(self, tmp_path): from oellm.contrib.audiobench import suite ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - # Write a fake AudioBench result JSON into the run_dir that - # _extract_metrics will pick up. 
- run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "task_result.json").write_text(json.dumps({"wer": 0.063})) - return _FakeCompletedProcess(0) - with patch( - "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score(ab_dir, score_value=0.063), ) as mock_sp: suite.run( model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir), "LIMIT": "100"}, ) assert mock_sp.call_count == 1 cmd = mock_sp.call_args.args[0] assert cmd[:2] == ["python", "src/main_evaluate.py"] - assert "--dataset" in cmd - assert cmd[cmd.index("--dataset") + 1] == "librispeech_test_clean" - assert cmd[cmd.index("--model") + 1] == "qwen2_audio" - assert cmd[cmd.index("--model_name") + 1] == "Qwen/Qwen2-Audio-7B-Instruct" + + # AudioBench's actual main() signature: dataset_name / model_name + # / metrics / overwrite / number_of_samples. No --model, no + # --log_dir, no --data_dir. + assert cmd[cmd.index("--dataset_name") + 1] == "librispeech_test_clean" + assert cmd[cmd.index("--model_name") + 1] == "Qwen2-Audio-7B-Instruct" assert cmd[cmd.index("--metrics") + 1] == "wer" - # LIMIT propagated. + assert cmd[cmd.index("--overwrite") + 1] == "True" assert cmd[cmd.index("--number_of_samples") + 1] == "100" - # cwd is AUDIOBENCH_DIR. + + # Flags AudioBench does NOT accept must not be in the cmd. + assert "--model" not in cmd # only --model_name exists upstream + assert "--log_dir" not in cmd # AudioBench writes to a fixed path + assert "--data_dir" not in cmd # split selection is via dataset_name + + # cwd is AUDIOBENCH_DIR so AudioBench's relative writes + # (log_for_all_models/...) land inside the clone. assert mock_sp.call_args.kwargs["cwd"] == str(ab_dir) - # Output JSON is lmms-eval-shaped and contains the extracted metric. + # Output JSON is lmms-eval-shaped. body = json.loads(output_path.read_text()) assert body["model_name_or_path"] == "Qwen/Qwen2-Audio-7B-Instruct" assert body["results"]["audiobench_librispeech_test_clean"][ @@ -500,32 +645,31 @@ def fake_run(cmd, cwd, env, check): ] == pytest.approx(0.063) assert body["configs"]["audiobench_librispeech_test_clean"]["num_fewshot"] == 0 - def test_run_forwards_data_dir_for_gigaspeech2(self, tmp_path): + def test_run_uses_per_split_dataset_name_for_gigaspeech2(self, tmp_path): + """GigaSpeech2 splits are dispatched via the dataset_name itself + (``gigaspeech2_thai``), not a ``--data_dir`` flag. 
+ """ from oellm.contrib.audiobench import suite ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "gs2.json").write_text(json.dumps({"wer": 0.12})) - return _FakeCompletedProcess(0) - with patch( - "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score(ab_dir, score_value=0.12), ) as mock_sp: suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_gigaspeech2_thai", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir)}, ) cmd = mock_sp.call_args.args[0] - assert "--data_dir" in cmd - assert cmd[cmd.index("--data_dir") + 1] == "th-test" + assert cmd[cmd.index("--dataset_name") + 1] == "gigaspeech2_thai" + assert "--data_dir" not in cmd def test_run_omits_number_of_samples_when_limit_empty(self, tmp_path): from oellm.contrib.audiobench import suite @@ -533,26 +677,48 @@ def test_run_omits_number_of_samples_when_limit_empty(self, tmp_path): ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "r.json").write_text(json.dumps({"wer": 0.1})) - return _FakeCompletedProcess(0) - with patch( - "oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score(ab_dir, score_value=0.1), ) as mock_sp: suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir), "LIMIT": ""}, ) cmd = mock_sp.call_args.args[0] assert "--number_of_samples" not in cmd + def test_run_always_passes_overwrite_true(self, tmp_path): + """AudioBench skips evaluation when a stale score file already + exists unless ``--overwrite True`` is passed; we always pass it + because we do our own deduplication via output_path. 
+ """ + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" + + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score(ab_dir, score_value=0.1), + ) as mock_sp: + suite.run( + model_path="Qwen/Qwen2-Audio-7B-Instruct", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="Qwen2-Audio-7B-Instruct", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + cmd = mock_sp.call_args.args[0] + assert cmd[cmd.index("--overwrite") + 1] == "True" + def test_run_nonzero_exit_raises(self, tmp_path): from oellm.contrib.audiobench import suite @@ -565,38 +731,36 @@ def test_run_nonzero_exit_raises(self, tmp_path): ): with pytest.raises(RuntimeError, match="AudioBench exited with code 1"): suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir)}, ) def test_run_handles_nested_metric_json(self, tmp_path): - """AudioBench output format has drifted; support - ``{"metrics": {"wer": {"score": 0.05}}}`` as well as flat - ``{"wer": 0.05}``. + """AudioBench's score-file shape has drifted across releases; we + tolerate both ``{"wer": 0.05}`` and + ``{"metrics": {"wer": {"score": 0.05}}}`` layouts. """ from oellm.contrib.audiobench import suite ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "nested.json").write_text( - json.dumps({"metrics": {"wer": {"score": 0.051, "notes": "nested"}}}) - ) - return _FakeCompletedProcess(0) - - with patch("oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run): + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score( + ab_dir, score_value=0.051, body_shape="nested" + ), + ): suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir)}, ) @@ -605,25 +769,53 @@ def fake_run(cmd, cwd, env, check): "wer" ] == pytest.approx(0.051) - def test_run_missing_metric_in_output_raises(self, tmp_path): + def test_run_missing_score_file_raises(self, tmp_path): + """If AudioBench exits 0 but doesn't write the score file at the + expected path, surface a clear error rather than producing an + empty CSV row downstream. 
+ """ from oellm.contrib.audiobench import suite ab_dir = self._fake_audiobench_tree(tmp_path) output_path = tmp_path / "result.json" - def fake_run(cmd, cwd, env, check): - run_dir = Path(cmd[cmd.index("--log_dir") + 1]) - (run_dir / "no_metric.json").write_text(json.dumps({"irrelevant": 1})) - return _FakeCompletedProcess(0) + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score( + ab_dir, score_value=0.0, body_shape="no_file" + ), + ): + with pytest.raises( + RuntimeError, match="AudioBench did not write expected score file" + ): + suite.run( + model_path="Qwen/Qwen2-Audio-7B-Instruct", + task="audiobench_librispeech_test_clean", + n_shot=0, + output_path=output_path, + model_flags="Qwen2-Audio-7B-Instruct", + env={"AUDIOBENCH_DIR": str(ab_dir)}, + ) + + def test_run_score_file_without_metric_key_raises(self, tmp_path): + from oellm.contrib.audiobench import suite + + ab_dir = self._fake_audiobench_tree(tmp_path) + output_path = tmp_path / "result.json" - with patch("oellm.contrib.audiobench.suite.subprocess.run", side_effect=fake_run): + with patch( + "oellm.contrib.audiobench.suite.subprocess.run", + side_effect=self._fake_run_writing_score( + ab_dir, score_value=0.0, body_shape="missing_metric" + ), + ): with pytest.raises(RuntimeError, match="Could not locate metric"): suite.run( - model_path="Qwen/Qwen2-Audio-7B", + model_path="Qwen/Qwen2-Audio-7B-Instruct", task="audiobench_librispeech_test_clean", n_shot=0, output_path=output_path, - model_flags="qwen2_audio", + model_flags="Qwen2-Audio-7B-Instruct", env={"AUDIOBENCH_DIR": str(ab_dir)}, ) @@ -662,11 +854,40 @@ def test_dry_run_writes_audiobench_suite_to_csv(self, tmp_path): csv_files = list(tmp_path.glob("**/jobs.csv")) assert len(csv_files) == 1 df = pd.read_csv(csv_files[0]) - # All rows route to audiobench (with or without model-flag suffix). + # All rows route to audiobench (with model-flag suffix). assert all(s.startswith("audiobench") for s in df["eval_suite"].unique()) # task_path column contains canonical audiobench_ names. assert all(t.startswith("audiobench_") for t in df["task_path"].unique()) + def test_dry_run_preserves_model_flag_capitalization(self, tmp_path): + """Regression: scheduler.py used to lowercase the entire eval_suite + column, breaking AudioBench's case-sensitive dispatch keys + (``Qwen2-Audio-7B-Instruct`` was being mangled to + ``qwen2-audio-7b-instruct``). + """ + import pandas as pd + + from oellm.main import schedule_evals + + with ( + patch("oellm.scheduler._load_cluster_env"), + patch("oellm.scheduler._num_jobs_in_queue", return_value=0), + patch.dict(os.environ, {"EVAL_OUTPUT_DIR": str(tmp_path)}), + ): + schedule_evals( + models="Qwen/Qwen2-Audio-7B-Instruct", + task_groups=ASR_GROUP, + skip_checks=True, + venv_path=str(Path(sys.prefix)), + dry_run=True, + ) + + csv_files = list(tmp_path.glob("**/jobs.csv")) + df = pd.read_csv(csv_files[0]) + suites = set(df["eval_suite"].unique()) + # The exact AudioBench dispatch literal must come through case-intact. + assert "audiobench:Qwen2-Audio-7B-Instruct" in suites + def test_dry_run_sbatch_contains_contrib_dispatch(self, tmp_path): from oellm.main import schedule_evals @@ -687,7 +908,7 @@ def test_dry_run_sbatch_contains_contrib_dispatch(self, tmp_path): assert len(sbatch_files) == 1 content = sbatch_files[0].read_text() assert "oellm.contrib.dispatch" in content - # LIMIT is now exported so contrib plugins can read it. + # LIMIT is exported so contrib plugins can read it. 
assert "export LIMIT=" in content From bce403bb398cd1a22b578a4f8e398c4becfacddd Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 4 May 2026 16:02:49 +0200 Subject: [PATCH 4/6] update readme --- docs/VENV.md | 24 +++- oellm/contrib/audiobench/README.md | 150 ++++++++++++++--------- oellm/contrib/regiondial_bench/README.md | 57 ++++----- 3 files changed, 142 insertions(+), 89 deletions(-) diff --git a/docs/VENV.md b/docs/VENV.md index 553500fc..da9f53bb 100644 --- a/docs/VENV.md +++ b/docs/VENV.md @@ -4,7 +4,29 @@ Instead of using pre-built containers, you can run evaluations with your own Python virtual environment by passing `--venv-path`. -## Setup +## Choosing your venv + +Most evaluations share **one general venv**. A handful of framework-level +suites have hard dependency conflicts and need their own venv: + +| Task group(s) | Engine | Venv | Setup | +|---|---|---|---| +| `open-sci-*`, `belebele_*_cf`, all text/multilingual tasks | `lm-eval-harness`, `lighteval` | **general** | [Setup](#setup-general-venv) | +| `image-*`, `video-*`, `audio-*` (modality-prefixed) | `lmms-eval` | **general** | [Setup](#setup-general-venv) | +| `dclm-core-22` | `lm-eval-harness` (pinned 0.4.9.2) | **dclm** | [DCLM-core-22](#dclm-core-22) | +| `reasoning` (GPQA/MATH500/AIME/MBPP/etc.) | `evalchemy` + forked lm-eval | **evalchemy** | [Evalchemy](#evalchemy-reasoning) | + +Custom contrib benchmarks bring their own dependency stacks and are +documented in `oellm/contrib//README.md`: + +| Task group(s) | Contrib | README | +|---|---|---| +| `audio-audiobench*` | `audiobench` | [`oellm/contrib/audiobench/README.md`](../oellm/contrib/audiobench/README.md) | +| `regiondial-*` | `regiondial_bench` | [`oellm/contrib/regiondial_bench/README.md`](../oellm/contrib/regiondial_bench/README.md) | + +Use `oellm list-tasks` to see which suite a given task group routes to. + +## Setup (general venv) 1. Create a venv with Python 3.12: ```bash diff --git a/oellm/contrib/audiobench/README.md b/oellm/contrib/audiobench/README.md index 4306901a..ea4c49ab 100644 --- a/oellm/contrib/audiobench/README.md +++ b/oellm/contrib/audiobench/README.md @@ -34,38 +34,22 @@ and depend on a vLLM judge service being provisioned on Leonardo. ## Prerequisites -### 1. Clone AudioBench on the cluster +AudioBench is not pip-installable (no upstream build backend, bare imports +in `src/main_evaluate.py`); the plugin invokes it as a subprocess from an +on-cluster clone. A dedicated venv is required: the `[audiobench]` extra +pins `transformers<5` and `jiwer<3`, which conflict with the general eval +venv (see [`docs/VENV.md`](../../../docs/VENV.md) for the framework venvs). -AudioBench is **not** pip-installable — upstream is a script harness with -bare imports (`from dataset import ...` inside `src/main_evaluate.py`) and -no `pyproject.toml` / `setup.py`. The plugin invokes it as a subprocess -from an on-cluster clone. +### 1. Clone AudioBench ```bash git clone https://github.com/AudioLLMs/AudioBench /path/to/AudioBench ``` -We track the **latest `main`** — no pinned SHA — so updates are a simple -`git pull` under `$AUDIOBENCH_DIR`. If a breaking upstream change lands, -file an issue and we'll introduce a pin. +AudioBench's `main` branch is tracked without a pinned SHA; updates are a +`git pull` under `$AUDIOBENCH_DIR`. -### 2. 
Install AudioBench's own runtime dependencies - -Still inside the clone: - -```bash -cd /path/to/AudioBench -python -m venv .venv && source .venv/bin/activate -pip install -r requirements.txt -``` - -AudioBench's deps (unpinned upstream): `transformers`, `vllm`, `datasets`, -`torchaudio`, `peft`, `autoawq`, `huggingface-hub`, `librosa`, `soundfile`, -`fire`, `evaluate`, `jiwer`, `more_itertools`. Use a **separate venv** -from the elliot-cli venv — AudioBench typically pulls in a bleeding-edge -`transformers` that will conflict with lmms-eval's pin. - -### 3. Configure `clusters.yaml` +### 2. Configure clusters.yaml Add `AUDIOBENCH_DIR` to your cluster block in `oellm/resources/clusters.yaml`: @@ -76,29 +60,67 @@ leonardo: AUDIOBENCH_DIR: "/leonardo/home/userexternal//AudioBench" ``` -The plugin fails fast at dispatch time (via -`oellm.contrib.dispatch`'s `CLUSTER_ENV_VARS` check) if the variable is -missing, so you'll get a clean error message instead of a crash deep -inside the subprocess. - -### 4. Install the elliot-cli `audiobench` extra +`oellm.contrib.dispatch`'s `CLUSTER_ENV_VARS` check raises a clear error +at dispatch time if the variable is missing. -On the submission / login node where you run `oellm schedule-evals`: +### 3. Create a venv and install the `[audiobench]` extra ```bash +uv venv --python 3.12 audiobench-venv +source audiobench-venv/bin/activate uv pip install -e ".[audiobench]" ``` -This installs our Python-side scorer deps (`jiwer`, `sacrebleu`, -`pythainlp`, `evaluate`) used for result post-processing — **not** -AudioBench itself. +The extra pins `transformers>=4.45,<5`, `jiwer<3`, `sacrebleu`, +`pythainlp`, `evaluate`, `soundfile`, `librosa`. -### 5. Dataset pre-download +### 4. Install AudioBench's runtime dependencies -No manual steps required. `schedule-evals` auto-downloads every -`AudioLLMs/*` HF repo referenced by the requested task group on the -login node via `huggingface_hub.snapshot_download(max_workers=2)` so the -compute nodes do not need internet access. +Filter `vllm` — it is only used by judge-dependent tasks (deferred): + +```bash +grep -v -i '^vllm' /path/to/AudioBench/requirements.txt > /tmp/ab-reqs.txt +uv pip install -r /tmp/ab-reqs.txt +``` + +### 5. Re-pin PyTorch for the cluster's CUDA driver + +PyPI's default torch wheels target a CUDA runtime newer than most HPC +drivers (Leonardo, JURECA report CUDA 12.2) and crash with +`NVIDIA driver too old`. Use the `cu121` index: + +```bash +uv pip install torch torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/cu121 +``` + +### 6. Reinstall rapidfuzz + +The pure-Python fallback raises `NotImplementedError` on +`Levenshtein.editops`, which jiwer's WER scoring calls. Force a fresh +install of the C extension: + +```bash +uv pip install --reinstall rapidfuzz +``` + +### 7. Verify + +```bash +python -c " +from transformers import Qwen2AudioForConditionalGeneration +from rapidfuzz.distance import Levenshtein +Levenshtein.editops('a', 'b') # must not raise NotImplementedError +print('audiobench venv OK') +" +``` + +### Dataset pre-download + +No manual steps required. `schedule-eval` pre-downloads every +`AudioLLMs/*` HF repo referenced by the requested task group on the login +node via `huggingface_hub.snapshot_download(max_workers=2)`, so compute +nodes do not need internet access. ## Running @@ -115,23 +137,23 @@ compute nodes do not need internet access. 
```bash # Full AudioBench suite on a Qwen2-Audio model: -oellm schedule-evals \ +oellm schedule-eval \ --models Qwen/Qwen2-Audio-7B-Instruct \ --task-groups audio-audiobench \ - --venv-path ~/elliot-venv + --venv-path audiobench-venv # ASR only: -oellm schedule-evals \ +oellm schedule-eval \ --models Qwen/Qwen2-Audio-7B-Instruct \ --task-groups audio-audiobench-asr \ - --venv-path ~/elliot-venv + --venv-path audiobench-venv # Smoke test with --limit: -oellm schedule-evals \ +oellm schedule-eval \ --models Qwen/Qwen2-Audio-7B-Instruct \ --task-groups audio-audiobench-asr \ --limit 100 \ - --venv-path ~/elliot-venv + --venv-path audiobench-venv ``` `--limit N` is forwarded to AudioBench's `--number_of_samples N`. When @@ -154,23 +176,31 @@ vs `lmms_eval`) — no silent averaging. ## Supported model adapters -| Model path pattern | AudioBench `--model` key | -|-------------------------------------|--------------------------| -| `*qwen2-audio*` / `*qwen-audio*` | `qwen2_audio` | -| `*salmonn*` | `salmonn` | -| `*ltu-*` / `*/ltu*` / `*ltu_as*` | `ltu` | -| `*whisper-*` / `*/whisper*` | `whisper` | -| `*audio-flamingo*` / `*audioflamingo*` | `audioflamingo` | -| `*meralion*` | `meralion` | -| (anything else) | `generic` (default HF pipeline) | - -To override detection explicitly, pass the key as a suffix in the suite -column: `audiobench:qwen2_audio`. The dispatcher in -`oellm/contrib/dispatch.py` already splits on `:`. +AudioBench dispatches on a fixed list of literal `model_name` strings +(see `$AUDIOBENCH_DIR/src/model.py`); each loader under `model_src/` +fetches its own HF repo. Arbitrary HF checkpoints are not supported — +only the variants below: + +| Model path substring (lowered) | AudioBench `model_name` (literal) | +|------------------------------------------------|-------------------------------------------| +| `qwen2-audio-7b-instruct` / `qwen2_audio_7b_instruct` | `Qwen2-Audio-7B-Instruct` | +| `qwen-audio-chat` / `qwen_audio_chat` | `Qwen-Audio-Chat` | +| `salmonn` | `SALMONN_7B` | +| `meralion-audiollm` / `meralion_audiollm` | `MERaLiON-AudioLLM-Whisper-SEA-LION` | +| `whisper-large-v3` / `whisper_large_v3` | `whisper_large_v3` | +| `whisper-large-v2` / `whisper_large_v2` | `whisper_large_v2` | +| `phi-4-multimodal` / `phi_4_multimodal` | `phi_4_multimodal_instruct` | +| `seallms-audio-7b` / `seallms_audio_7b` | `seallms_audio_7b` | +| `wavllm` | `WavLLM_fairseq` | +| (anything else) | error — no generic loader upstream | + +To override detection, pass the literal AudioBench key as a suffix: +`audiobench:Qwen2-Audio-7B-Instruct`. Case is preserved end-to-end +(AudioBench's match is case-sensitive). ## How results flow end-to-end -1. `schedule-evals` expands `audio-audiobench*` groups → 27 rows in +1. `schedule-eval` expands `audio-audiobench*` groups → 27 rows in `jobs.csv` with `eval_suite=audiobench` (plus an adapter suffix from `detect_model_flags`). 2. `_collect_dataset_specs` auto-derives `needs_snapshot_download=True` diff --git a/oellm/contrib/regiondial_bench/README.md b/oellm/contrib/regiondial_bench/README.md index b540d6ef..57f646ea 100644 --- a/oellm/contrib/regiondial_bench/README.md +++ b/oellm/contrib/regiondial_bench/README.md @@ -16,17 +16,17 @@ plus per-round breakdown (R1–R7) for gIoU and bbox_AP. ## Prerequisites -### 1. 
Clone RegionReasoner +The benchmark calls `test/evaluation/evaluation_multi_segmentation.py` and +the `test/vision_reasoner/` model wrapper from the RegionReasoner +repository as a subprocess, so the repo must be present on the cluster +filesystem. A dedicated venv is required for `flash-attn` (specific +pre-built wheel) and HEIF image support (`pi-heif`); see +[`docs/VENV.md`](../../../docs/VENV.md) for the framework venvs. -The benchmark relies on the inference script -`test/evaluation/evaluation_multi_segmentation.py` and the model wrapper -`test/vision_reasoner/` from the RegionReasoner repository. These are **not -packaged** — the platform calls them directly as a subprocess, so the repo -must be present on the cluster filesystem. +### 1. Clone RegionReasoner ```bash -git clone https://github.com/lmsdss/RegionReasoner \ - /path/to/RegionReasoner +git clone https://github.com/lmsdss/RegionReasoner /path/to/RegionReasoner ``` ### 2. Configure clusters.yaml @@ -38,40 +38,41 @@ my-cluster: ... HF_HOME: "/path/to/large/filesystem/huggingface" # must have ~30 GB free REGION_REASONER_DIR: "/path/to/RegionReasoner" - GPUS_PER_NODE: 4 # controls both SLURM --gres and shard count + GPUS_PER_NODE: 4 # controls SLURM --gres and shard count ``` -> **`HF_HOME`** must point to a filesystem with at least **30 GB** of free -> space. On CINECA Leonardo, use the work filesystem -> (`/leonardo_work//huggingface`), not the home filesystem (50 GB -> quota, fills up quickly). +> `HF_HOME` must point to a filesystem with at least 30 GB free. On +> CINECA Leonardo, use the work filesystem +> (`/leonardo_work//huggingface`), not the home filesystem +> (50 GB quota). -### 3. Install dependencies in your venv +### 3. Create a venv and install dependencies ```bash -# PyTorch — match the CUDA version available on your cluster -pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 +uv venv --python 3.12 regiondial-venv +source regiondial-venv/bin/activate +uv pip install -e . -# Matching torchvision -pip install torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121 +# PyTorch — match the cluster's CUDA driver (cu121 for driver supporting CUDA 12.2) +uv pip install torch==2.5.1 torchvision==0.20.1 \ + --index-url https://download.pytorch.org/whl/cu121 -# flash-attn pre-built wheel (no compilation needed) +# flash-attn pre-built wheel (Python 3.12 / CUDA 12.x / torch 2.5.1) wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl -pip install flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl +uv pip install flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl # HEIF image support -pip install pi-heif +uv pip install pi-heif ``` -> **flash-attn note:** The pre-built wheel above is for Python 3.12, CUDA 12.x, -> torch 2.5.1. If your configuration differs, find the matching wheel at -> https://github.com/Dao-AILab/flash-attention/releases +> If your Python / CUDA / torch combination differs, find the matching +> flash-attn wheel at +> . -### 4. 
What gets auto-downloaded +### What gets auto-downloaded -When you run `oellm schedule-eval`, the platform automatically pre-downloads -the following on the login node (before SLURM submission, so compute nodes do -not need internet access): +`oellm schedule-eval` pre-downloads the following on the login node so +compute nodes do not need internet access: | Asset | HF repo | Size | |---|---|---| From b355cd57983eb41447eff7fdc65e1557c3ed5eee Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 4 May 2026 23:10:31 +0200 Subject: [PATCH 5/6] fix comments --- oellm/contrib/audiobench/README.md | 89 +++++++----------------------- 1 file changed, 21 insertions(+), 68 deletions(-) diff --git a/oellm/contrib/audiobench/README.md b/oellm/contrib/audiobench/README.md index ea4c49ab..60ab3261 100644 --- a/oellm/contrib/audiobench/README.md +++ b/oellm/contrib/audiobench/README.md @@ -40,30 +40,22 @@ on-cluster clone. A dedicated venv is required: the `[audiobench]` extra pins `transformers<5` and `jiwer<3`, which conflict with the general eval venv (see [`docs/VENV.md`](../../../docs/VENV.md) for the framework venvs). -### 1. Clone AudioBench +### 1. Clone AudioBench and configure `clusters.yaml` ```bash git clone https://github.com/AudioLLMs/AudioBench /path/to/AudioBench ``` -AudioBench's `main` branch is tracked without a pinned SHA; updates are a -`git pull` under `$AUDIOBENCH_DIR`. - -### 2. Configure clusters.yaml - Add `AUDIOBENCH_DIR` to your cluster block in `oellm/resources/clusters.yaml`: ```yaml leonardo: ... - AUDIOBENCH_DIR: "/leonardo/home/userexternal//AudioBench" + AUDIOBENCH_DIR: "/path/to/AudioBench" ``` -`oellm.contrib.dispatch`'s `CLUSTER_ENV_VARS` check raises a clear error -at dispatch time if the variable is missing. - -### 3. Create a venv and install the `[audiobench]` extra +### 2. Create the venv ```bash uv venv --python 3.12 audiobench-venv @@ -71,49 +63,36 @@ source audiobench-venv/bin/activate uv pip install -e ".[audiobench]" ``` -The extra pins `transformers>=4.45,<5`, `jiwer<3`, `sacrebleu`, -`pythainlp`, `evaluate`, `soundfile`, `librosa`. - -### 4. Install AudioBench's runtime dependencies +The `[audiobench]` extra pins `transformers>=4.45,<5`, `jiwer<3`, +`sacrebleu`, `pythainlp`, `evaluate`, `soundfile`, `librosa`. -Filter `vllm` — it is only used by judge-dependent tasks (deferred): +### 3. Install AudioBench's runtime dependencies ```bash +# AudioBench's own requirements (filter vllm; only used by deferred judge tasks) grep -v -i '^vllm' /path/to/AudioBench/requirements.txt > /tmp/ab-reqs.txt uv pip install -r /tmp/ab-reqs.txt -``` - -### 5. Re-pin PyTorch for the cluster's CUDA driver -PyPI's default torch wheels target a CUDA runtime newer than most HPC -drivers (Leonardo, JURECA report CUDA 12.2) and crash with -`NVIDIA driver too old`. Use the `cu121` index: - -```bash +# PyTorch for cluster's CUDA driver — PyPI defaults target a newer runtime +# than most HPC drivers (Leonardo / JURECA report CUDA 12.2) and crash with +# `NVIDIA driver too old`. Use the cu121 index. uv pip install torch torchvision torchaudio \ --index-url https://download.pytorch.org/whl/cu121 -``` - -### 6. Reinstall rapidfuzz -The pure-Python fallback raises `NotImplementedError` on -`Levenshtein.editops`, which jiwer's WER scoring calls. Force a fresh -install of the C extension: - -```bash +# rapidfuzz C extension — without this, jiwer's WER scoring hits the +# pure-Python fallback and raises NotImplementedError on Levenshtein.editops. 
uv pip install --reinstall rapidfuzz ``` -### 7. Verify - -```bash -python -c " -from transformers import Qwen2AudioForConditionalGeneration -from rapidfuzz.distance import Levenshtein -Levenshtein.editops('a', 'b') # must not raise NotImplementedError -print('audiobench venv OK') -" -``` +> Verify the venv works: +> ```bash +> python -c " +> from transformers import Qwen2AudioForConditionalGeneration +> from rapidfuzz.distance import Levenshtein +> Levenshtein.editops('a', 'b') # must not raise +> print('audiobench venv OK') +> " +> ``` ### Dataset pre-download @@ -198,29 +177,3 @@ To override detection, pass the literal AudioBench key as a suffix: `audiobench:Qwen2-Audio-7B-Instruct`. Case is preserved end-to-end (AudioBench's match is case-sensitive). -## How results flow end-to-end - -1. `schedule-eval` expands `audio-audiobench*` groups → 27 rows in - `jobs.csv` with `eval_suite=audiobench` (plus an adapter suffix from - `detect_model_flags`). -2. `_collect_dataset_specs` auto-derives `needs_snapshot_download=True` - from the group-name prefix (`audio-*`) and snapshots every referenced - `AudioLLMs/*` repo to the shared HF cache. -3. `template.sbatch`'s `*)` catch-all invokes - `python -m oellm.contrib.dispatch --suite audiobench: …`. -4. `oellm.contrib.audiobench.suite.run()` subprocesses - `python src/main_evaluate.py …` inside `$AUDIOBENCH_DIR`, captures - the result JSON AudioBench writes under its `--log_dir`, extracts the - metric value, and writes a lmms-eval-compatible JSON at - `$output_path`. -5. `collect-results` reads it via `parse_results()` and the standard - `_resolve_metric` fallback chain — no special-casing in core code. - -## Open items - -- **Judge service hosting:** judge-dependent tasks need a Llama-3-70B-AWQ - judge on an OpenAI-compatible endpoint. Plan is a separate long-running - vLLM sbatch whose URL/model lands in `clusters.yaml` as - `AUDIOBENCH_JUDGE_URL` and `AUDIOBENCH_JUDGE_MODEL`. -- **MERaLiON / IMDA NSC tasks:** ~21 gated AudioBench tasks require - corpora not on public HF. Deferred until WP4 needs them. 
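
Reviewer note: the score-file handling that the tests in this series pin down can be summarised in a short sketch. This is not the shipped `suite.py`; the path layout, the flat and nested JSON shapes, and the error messages are taken from the test assertions (`_score_file_path`, `_fake_run_writing_score`, and the `pytest.raises` match strings), and everything else (the function name, signature) is illustrative only.

```python
# Illustrative sketch of the metric-extraction step the tests describe.
# Assumptions: path layout and JSON shapes come from the test helpers;
# extract_metric() itself is a hypothetical name, not the real suite API.
import json
from pathlib import Path


def extract_metric(audiobench_dir: Path, model_name: str, dataset: str, metric: str) -> float:
    """Read AudioBench's hardcoded score file and return the metric value."""
    score_file = (
        audiobench_dir / "log_for_all_models" / model_name / f"{dataset}_{metric}_score.json"
    )
    if not score_file.exists():
        # AudioBench exited 0 but never wrote its output: fail loudly.
        raise RuntimeError(f"AudioBench did not write expected score file: {score_file}")

    body = json.loads(score_file.read_text())

    # Flat layout: {"wer": 0.063}
    if isinstance(body.get(metric), (int, float)):
        return float(body[metric])

    # Nested layout: {"metrics": {"wer": {"score": 0.051, "n": 100}}}
    nested = body.get("metrics", {}).get(metric, {})
    if isinstance(nested, dict) and isinstance(nested.get("score"), (int, float)):
        return float(nested["score"])

    raise RuntimeError(f"Could not locate metric '{metric}' in {score_file}")
```
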
From 6117bd511582f3c381ff66084e8ba18fe9b829eb Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Mon, 4 May 2026 23:13:39 +0200 Subject: [PATCH 6/6] Revert accidental clusters.yaml changes --- oellm/resources/clusters.yaml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index 370201b6..36e23d29 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -1,5 +1,5 @@ shared: - TIME_LIMIT: "02:30:00" # time limit in the format HH:MM:SS + TIME_LIMIT: "00:30:00" # time limit in the format HH:MM:SS UV_LINK_MODE: "copy" EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 @@ -7,16 +7,13 @@ shared: HF_DATASETS_DISABLE_PROGRESS_BARS: "1" leonardo: - hostname_pattern: "*.leonardo.local" - EVAL_BASE_DIR: "/leonardo/home/userexternal/islobozh/oellm-cli-shared-evals/" - PARTITION: "boost_usr_prod" - ACCOUNT: "OELLM_prod2026" - QUEUE_LIMIT: 1000 - EVAL_CONTAINER_IMAGE: "eval_env-leonardo.sif" + hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML + EVAL_BASE_DIR: "/leonardo_work/AIFAC_L01_028/oellm-cli-shared-evals" + PARTITION: "boost_usr_prod" # default partition to use + ACCOUNT: "OELLM_prod2026" # default account to use + QUEUE_LIMIT: 1000 # maximum number of jobs that can be submitted as job/array, used to send only jobs that respects QOS + EVAL_CONTAINER_IMAGE: "eval_env-leonardo.sif" # name of the container image that is pulled which is built automatically with Github actions SINGULARITY_ARGS: "--nv" - HF_HOME: "/leonardo_work/OELLM_prod2026/huggingface" - GPUS_PER_NODE: 4 - REGION_REASONER_DIR: "/leonardo/home/userexternal/islobozh/RegionReasoner" jureca: hostname_pattern: "*.jureca"
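
Reviewer note: the adapter behaviour exercised in the tests (literal, case-sensitive AudioBench dispatch keys; `None` for unmatched model paths) amounts to a small substring lookup. The sketch below is not the shipped `adapter.py`; the substring-to-literal pairs are copied from the README table and the test expectations, and the function body is illustrative only.

```python
# Minimal sketch, assuming a plain substring match on the lowercased model
# path. The literals are AudioBench's dispatch keys as listed in the README;
# the dict layout and the function body are hypothetical.
AUDIOBENCH_DISPATCH_KEYS = {
    "qwen2-audio-7b-instruct": "Qwen2-Audio-7B-Instruct",
    "qwen2_audio_7b_instruct": "Qwen2-Audio-7B-Instruct",
    "qwen-audio-chat": "Qwen-Audio-Chat",
    "qwen_audio_chat": "Qwen-Audio-Chat",
    "salmonn": "SALMONN_7B",
    "meralion-audiollm": "MERaLiON-AudioLLM-Whisper-SEA-LION",
    "meralion_audiollm": "MERaLiON-AudioLLM-Whisper-SEA-LION",
    "whisper-large-v3": "whisper_large_v3",
    "whisper_large_v3": "whisper_large_v3",
    "whisper-large-v2": "whisper_large_v2",
    "whisper_large_v2": "whisper_large_v2",
    "phi-4-multimodal": "phi_4_multimodal_instruct",
    "phi_4_multimodal": "phi_4_multimodal_instruct",
    "seallms-audio-7b": "seallms_audio_7b",
    "seallms_audio_7b": "seallms_audio_7b",
    "wavllm": "WavLLM_fairseq",
}


def detect_audiobench_model_type(model_path: str) -> str | None:
    """Return AudioBench's case-sensitive model_name literal, or None if unmapped."""
    lowered = model_path.lower()
    for substring, literal in AUDIOBENCH_DISPATCH_KEYS.items():
        if substring in lowered:
            return literal
    # No generic loader exists upstream; the caller raises a clear error instead.
    return None
```

When a key is found, `resolve_suite` appends it verbatim to the suite name (for example `audiobench:Qwen2-Audio-7B-Instruct`, as the runner-integration test asserts); when the lookup returns `None`, the bare `audiobench` suite name is kept and `suite.run` raises at dispatch time.
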