Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/openbench/cli/commands/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def run_alias_mode(
wandb_tags: list[str] | None,
use_keywords: bool | None,
force_language: bool,
pipeline_config: list[str] | None,
verbose: bool,
) -> BenchmarkResult:
"""Run evaluation using pipeline and dataset aliases."""
Expand Down Expand Up @@ -208,6 +209,16 @@ def run_alias_mode(
if verbose:
typer.echo("✅ Force language: enabled")

# Handle generic pipeline config overrides (key=value pairs)
if pipeline_config:
for item in pipeline_config:
if "=" not in item:
raise typer.BadParameter(f"Invalid --pipeline-config format: '{item}'. Expected key=value")
key, value = item.split("=", 1)
pipeline_config_override[key] = value
if verbose:
typer.echo(f"Config override: {key}={value}")

pipeline = PipelineRegistry.create_pipeline(pipeline_name, config=pipeline_config_override)

######### Build Benchmark Config #########
Expand Down Expand Up @@ -345,6 +356,12 @@ def evaluate(
"--force-language",
help="Force language hinting for compatible pipelines",
),
pipeline_config: list[str] | None = typer.Option(
None,
"--pipeline-config",
"-pc",
help="Override pipeline config values as key=value pairs (e.g. --pipeline-config speaker=serena)",
),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
) -> None:
"""Run evaluation benchmarks.
Expand Down Expand Up @@ -406,6 +423,7 @@ def evaluate(
wandb_tags=wandb_tags,
use_keywords=use_keywords,
force_language=force_language,
pipeline_config=pipeline_config,
verbose=verbose,
)
display_result(result)
Expand Down
3 changes: 3 additions & 0 deletions src/openbench/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .dataset_diarization import DiarizationDataset, DiarizationSample
from .dataset_orchestration import OrchestrationDataset, OrchestrationSample
from .dataset_registry import DatasetRegistry
from .dataset_speech_generation import SpeechGenerationDataset, SpeechGenerationSample
from .dataset_streaming_transcription import StreamingDataset, StreamingSample
from .dataset_transcription import TranscriptionDataset, TranscriptionSample

Expand All @@ -24,11 +25,13 @@
"TranscriptionDataset",
"StreamingDataset",
"OrchestrationDataset",
"SpeechGenerationDataset",
# Sample types
"DiarizationSample",
"TranscriptionSample",
"StreamingSample",
"OrchestrationSample",
"SpeechGenerationSample",
# Registry
"DatasetRegistry",
]
15 changes: 15 additions & 0 deletions src/openbench/dataset/dataset_aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,20 @@ def register_dataset_aliases() -> None:
description="Common Voice dataset for transcription evaluation with up to 400 samples per language this subset contains only russian",
)

########## SPEECH GENERATION ##########

DatasetRegistry.register_alias(
"customer-service-tts-prompts-vocalized",
DatasetConfig(
dataset_id="argmaxinc/customer-service-tts-prompts-vocalized",
split="validation",
),
supported_pipeline_types={
PipelineType.SPEECH_GENERATION,
},
description="Customer service TTS prompts with vocalized audio for speech generation evaluation.",
)

########## STREAMING TRANSCRIPTION ##########

DatasetRegistry.register_alias(
Expand Down Expand Up @@ -646,6 +660,7 @@ def register_dataset_aliases() -> None:
PipelineType.DIARIZATION,
PipelineType.STREAMING_TRANSCRIPTION,
PipelineType.ORCHESTRATION,
PipelineType.SPEECH_GENERATION,
},
description="Local dataset for testing. To use this dataset you need to set the `LOCAL_DATASET_PATH` and `LOCAL_DATASET_SPLIT` environment variables.",
)
Expand Down
2 changes: 2 additions & 0 deletions src/openbench/dataset/dataset_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .dataset_base import BaseDataset, DatasetConfig
from .dataset_diarization import DiarizationDataset
from .dataset_orchestration import OrchestrationDataset
from .dataset_speech_generation import SpeechGenerationDataset
from .dataset_streaming_transcription import StreamingDataset
from .dataset_transcription import TranscriptionDataset

Expand Down Expand Up @@ -139,3 +140,4 @@ def has_alias(cls, alias: str) -> bool:
DatasetRegistry.register(PipelineType.ORCHESTRATION, OrchestrationDataset)
DatasetRegistry.register(PipelineType.STREAMING_TRANSCRIPTION, StreamingDataset)
DatasetRegistry.register(PipelineType.TRANSCRIPTION, TranscriptionDataset)
DatasetRegistry.register(PipelineType.SPEECH_GENERATION, SpeechGenerationDataset)
102 changes: 102 additions & 0 deletions src/openbench/dataset/dataset_speech_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# For licensing see accompanying LICENSE.md file.
# Copyright (C) 2025 Argmax, Inc. All Rights Reserved.

import numpy as np
from pydantic import Field
from typing_extensions import TypedDict

from ..pipeline_prediction import Transcript
from .dataset_base import BaseDataset, BaseSample


# Optional per-sample metadata for speech generation samples.
#   language: language name/tag carried through from the dataset row.
#   dialogue: list of speaker-turn dicts for multi-speaker rows.
SpeechGenerationExtraInfo = TypedDict(
    "SpeechGenerationExtraInfo",
    {"language": str, "dialogue": list[dict]},
    total=False,
)


# Minimal schema for a speech-generation dataset row: only the 'text'
# prompt column is required — no input audio column is expected.
SpeechGenerationRow = TypedDict("SpeechGenerationRow", {"text": str})


class SpeechGenerationSample(BaseSample[Transcript, SpeechGenerationExtraInfo]):
    """Sample type for speech generation (TTS) evaluation.

    The reference ``Transcript`` holds the original text prompt. A
    pipeline synthesizes audio for that prompt, transcribes it, and the
    transcription is scored (e.g. WER) against this reference.
    """

    # Filled in by the pipeline once TTS output exists; None until then.
    generated_audio_duration: float | None = Field(
        default=None,
        description=("Duration (seconds) of the TTS-generated audio. Set by the pipeline after generation."),
    )

    def get_audio_duration(self) -> float:
        """Return the generated audio duration in seconds.

        Falls back to the base-class (dummy waveform) computation while
        the pipeline has not yet recorded a real duration.
        """
        duration = self.generated_audio_duration
        if duration is None:
            return super().get_audio_duration()
        return duration

    @property
    def text(self) -> str:
        """Original prompt text, reconstructed from the reference transcript."""
        return self.reference.get_transcript_string()


class SpeechGenerationDataset(BaseDataset[SpeechGenerationSample]):
    """Dataset for speech generation pipelines.

    Expects column: 'text' (the prompt string).
    No audio column is required since audio is generated
    by the pipeline itself.
    """

    _expected_columns = ["text"]
    _sample_class = SpeechGenerationSample

    def _extract_audio_info(self, row: dict) -> tuple[str, np.ndarray, int]:
        """Override to provide dummy audio info.

        Speech generation datasets don't have input audio, so a
        1-sample silent waveform is returned purely to satisfy the
        framework's sample structure; pipelines ignore it entirely.

        The audio name is taken from a non-empty 'audio_name' column
        when present; otherwise it is derived from the row's 'idx'.
        """
        # Check 'audio_name' first so rows that carry one don't require
        # an 'idx' key (the original order raised KeyError on such rows).
        if row.get("audio_name"):
            audio_name = str(row["audio_name"])
        else:
            audio_name = f"sample_{row['idx']}"
        dummy_waveform = np.zeros(1, dtype=np.float32)
        dummy_sample_rate = 16000
        return audio_name, dummy_waveform, dummy_sample_rate

    def prepare_sample(self, row: SpeechGenerationRow) -> tuple[Transcript, SpeechGenerationExtraInfo]:
        """Build the reference Transcript and extra info from a row.

        The 'text' prompt is whitespace-split into words to create the
        reference Transcript. An optional 'language' column and a
        non-empty 'dialogue' column are forwarded as extra info.
        """
        words = row["text"].split()
        reference = Transcript.from_words_info(
            words=words,
        )

        extra_info: SpeechGenerationExtraInfo = {}
        if "language" in row:
            extra_info["language"] = row["language"]
        if "dialogue" in row and row["dialogue"]:
            extra_info["dialogue"] = row["dialogue"]

        return reference, extra_info
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def compute_keyword_stats(
) -> dict[str, Any]:
"""Compute keyword statistics between reference and hypothesis."""

if not dictionary:
return {"true_positives": 0, "ground_truth": 0, "false_positives": 0, "keyword_stats": {}}

# Convert transcripts to text
ref_text = reference.get_transcript_string()
hyp_text = hypothesis.get_transcript_string()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def compute_metric(self, detail: Details) -> float:
PipelineType.TRANSCRIPTION,
PipelineType.ORCHESTRATION,
PipelineType.STREAMING_TRANSCRIPTION,
PipelineType.SPEECH_GENERATION,
),
MetricOptions.WER,
)
Expand Down
1 change: 1 addition & 0 deletions src/openbench/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .diarization import *
from .orchestration import *
from .pipeline_registry import PipelineRegistry
from .speech_generation import *
from .streaming_transcription import *
from .transcription import *

Expand Down
133 changes: 133 additions & 0 deletions src/openbench/pipeline/pipeline_aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@
WhisperXPipeline,
)
from .pipeline_registry import PipelineRegistry
from .speech_generation import (
CartesiaSpeechGenerationPipeline,
ElevenLabsDialogueGenerationPipeline,
ElevenLabsSpeechGenerationPipeline,
GeminiSpeechGenerationPipeline,
OpenAISpeechGenerationPipeline,
WhisperKitSpeechGenerationPipeline,
)
from .streaming_transcription import (
AssemblyAIStreamingPipeline,
DeepgramStreamingPipeline,
Expand Down Expand Up @@ -642,6 +650,131 @@ def register_pipeline_aliases() -> None:
description="PyannoteAI transcription pipeline (ignores speaker attribution). Uses the precision-2 model with Nvidia Parakeet STT. Requires `PYANNOTE_TOKEN` env var from https://www.pyannote.ai/.",
)

################# SPEECH GENERATION PIPELINES #################

PipelineRegistry.register_alias(
"whisperkit-speech-generation",
WhisperKitSpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"cli_path": os.getenv("WHISPERKIT_CLI_PATH"),
"speaker": "aiden",
"language": "english",
"seed": 10,
"temperature": 0.9,
"top_k": 50,
"max_new_tokens": 245,
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
},
description="WhisperKit speech generation pipeline. Generates audio from text prompts using whisperkit-cli TTS, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires `WHISPERKIT_CLI_PATH` env var pointing to the whisperkit-cli binary.",
)

PipelineRegistry.register_alias(
"cartesia-speech-generation",
CartesiaSpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"model_id": "sonic-3",
"voice_id": "e07c00bc-4134-4eae-9ea4-1a55fb45746b",
"container": "wav",
"encoding": "pcm_f32le",
"sample_rate": 44100,
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="Cartesia speech generation pipeline. Generates audio from text prompts using Cartesia TTS API, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires `CARTESIA_API_KEY` and `WHISPERKITPRO_CLI_PATH` env vars.",
)

PipelineRegistry.register_alias(
"elevenlabs-dialogue-generation",
ElevenLabsDialogueGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"model_id": "eleven_v3",
"speaker_voice_map": {
"doctor": "9BWtsMINqrJLrRacOk9x",
"patient": "IKne3meq5aSn9XLyUdCD",
"assistant": "pFZP5JQG7iQjIQuC4Bku",
},
"default_voice_id": "9BWtsMINqrJLrRacOk9x",
"max_chars_per_chunk": 4500,
"chunk_silence_duration": 0.75,
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="ElevenLabs dialogue generation pipeline. Generates multi-speaker conversational audio "
"from dialogue turns using ElevenLabs text_to_dialogue API, then transcribes the generated "
"audio to compute WER against the original dialogue text. "
"Requires `ELEVENLABS_API_KEY` and `WHISPERKITPRO_CLI_PATH` env vars.",
)

PipelineRegistry.register_alias(
"elevenlabs-speech-generation",
ElevenLabsSpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"voice_id": "JBFqnCBsd6RMkjVDRZzb",
"model_id": "eleven_multilingual_v2",
"output_format": "mp3_44100_128",
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="ElevenLabs speech generation pipeline. Generates audio from text prompts using ElevenLabs TTS API, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires `ELEVENLABS_API_KEY` and `WHISPERKITPRO_CLI_PATH` env vars.",
)

PipelineRegistry.register_alias(
"gemini-speech-generation",
GeminiSpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"voice_name": "Charon",
"language_code": "en-US",
"model_name": "gemini-2.5-pro-tts",
"audio_encoding": "MP3",
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="Google Gemini speech generation pipeline. Generates audio from text prompts using Google Cloud TTS, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires Google Cloud credentials and `WHISPERKITPRO_CLI_PATH` env var.",
)

PipelineRegistry.register_alias(
"openai-speech-generation",
OpenAISpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"model": "gpt-4o-mini-tts",
"voice": "coral",
"response_format": "wav",
"speed": 1.0,
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="OpenAI speech generation pipeline. Generates audio from text prompts using OpenAI TTS API, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires `OPENAI_API_KEY` and `WHISPERKITPRO_CLI_PATH` env vars.",
)


################# STREAMING TRANSCRIPTION PIPELINES #################

PipelineRegistry.register_alias(
Expand Down
Loading
Loading