Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/openbench/cli/commands/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def run_alias_mode(
wandb_tags: list[str] | None,
use_keywords: bool | None,
force_language: bool,
pipeline_config: list[str] | None,
verbose: bool,
) -> BenchmarkResult:
"""Run evaluation using pipeline and dataset aliases."""
Expand Down Expand Up @@ -208,6 +209,16 @@ def run_alias_mode(
if verbose:
typer.echo("✅ Force language: enabled")

# Handle generic pipeline config overrides (key=value pairs)
if pipeline_config:
for item in pipeline_config:
if "=" not in item:
raise typer.BadParameter(f"Invalid --pipeline-config format: '{item}'. Expected key=value")
key, value = item.split("=", 1)
pipeline_config_override[key] = value
if verbose:
typer.echo(f"Config override: {key}={value}")

pipeline = PipelineRegistry.create_pipeline(pipeline_name, config=pipeline_config_override)

######### Build Benchmark Config #########
Expand Down Expand Up @@ -345,6 +356,12 @@ def evaluate(
"--force-language",
help="Force language hinting for compatible pipelines",
),
pipeline_config: list[str] | None = typer.Option(
None,
"--pipeline-config",
"-pc",
help="Override pipeline config values as key=value pairs (e.g. --pipeline-config speaker=serena)",
),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
) -> None:
"""Run evaluation benchmarks.
Expand Down Expand Up @@ -406,6 +423,7 @@ def evaluate(
wandb_tags=wandb_tags,
use_keywords=use_keywords,
force_language=force_language,
pipeline_config=pipeline_config,
verbose=verbose,
)
display_result(result)
Expand Down
3 changes: 3 additions & 0 deletions src/openbench/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .dataset_diarization import DiarizationDataset, DiarizationSample
from .dataset_orchestration import OrchestrationDataset, OrchestrationSample
from .dataset_registry import DatasetRegistry
from .dataset_speech_generation import SpeechGenerationDataset, SpeechGenerationSample
from .dataset_streaming_transcription import StreamingDataset, StreamingSample
from .dataset_transcription import TranscriptionDataset, TranscriptionSample

Expand All @@ -24,11 +25,13 @@
"TranscriptionDataset",
"StreamingDataset",
"OrchestrationDataset",
"SpeechGenerationDataset",
# Sample types
"DiarizationSample",
"TranscriptionSample",
"StreamingSample",
"OrchestrationSample",
"SpeechGenerationSample",
# Registry
"DatasetRegistry",
]
15 changes: 15 additions & 0 deletions src/openbench/dataset/dataset_aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,20 @@ def register_dataset_aliases() -> None:
description="Common Voice dataset for transcription evaluation with up to 400 samples per language this subset contains only russian",
)

########## SPEECH GENERATION ##########

DatasetRegistry.register_alias(
"customer-service-tts-prompts-vocalized",
DatasetConfig(
dataset_id="argmaxinc/customer-service-tts-prompts-vocalized",
split="validation",
),
supported_pipeline_types={
PipelineType.SPEECH_GENERATION,
},
description="Customer service TTS prompts with vocalized audio for speech generation evaluation.",
)

########## STREAMING TRANSCRIPTION ##########

DatasetRegistry.register_alias(
Expand Down Expand Up @@ -646,6 +660,7 @@ def register_dataset_aliases() -> None:
PipelineType.DIARIZATION,
PipelineType.STREAMING_TRANSCRIPTION,
PipelineType.ORCHESTRATION,
PipelineType.SPEECH_GENERATION,
},
description="Local dataset for testing. To use this dataset you need to set the `LOCAL_DATASET_PATH` and `LOCAL_DATASET_SPLIT` environment variables.",
)
Expand Down
2 changes: 2 additions & 0 deletions src/openbench/dataset/dataset_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .dataset_base import BaseDataset, DatasetConfig
from .dataset_diarization import DiarizationDataset
from .dataset_orchestration import OrchestrationDataset
from .dataset_speech_generation import SpeechGenerationDataset
from .dataset_streaming_transcription import StreamingDataset
from .dataset_transcription import TranscriptionDataset

Expand Down Expand Up @@ -139,3 +140,4 @@ def has_alias(cls, alias: str) -> bool:
DatasetRegistry.register(PipelineType.ORCHESTRATION, OrchestrationDataset)
DatasetRegistry.register(PipelineType.STREAMING_TRANSCRIPTION, StreamingDataset)
DatasetRegistry.register(PipelineType.TRANSCRIPTION, TranscriptionDataset)
DatasetRegistry.register(PipelineType.SPEECH_GENERATION, SpeechGenerationDataset)
102 changes: 102 additions & 0 deletions src/openbench/dataset/dataset_speech_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# For licensing see accompanying LICENSE.md file.
# Copyright (C) 2025 Argmax, Inc. All Rights Reserved.

import numpy as np
from pydantic import Field
from typing_extensions import TypedDict

from ..pipeline_prediction import Transcript
from .dataset_base import BaseDataset, BaseSample


# Optional per-sample metadata for speech generation samples.
#   language: language name/tag carried through from the dataset row.
#   dialogue: list of speaker-turn dicts for multi-speaker rows.
SpeechGenerationExtraInfo = TypedDict(
    "SpeechGenerationExtraInfo",
    {"language": str, "dialogue": list[dict]},
    total=False,
)


# Minimal schema for a speech-generation dataset row: only the 'text'
# prompt column is required — no input audio column is expected.
SpeechGenerationRow = TypedDict("SpeechGenerationRow", {"text": str})


class SpeechGenerationSample(BaseSample[Transcript, SpeechGenerationExtraInfo]):
    """Sample type for speech generation (TTS) evaluation.

    The reference ``Transcript`` holds the original text prompt. A
    pipeline synthesizes audio for that prompt, transcribes it, and the
    transcription is scored (e.g. WER) against this reference.
    """

    # Filled in by the pipeline once TTS output exists; None until then.
    generated_audio_duration: float | None = Field(
        default=None,
        description=("Duration (seconds) of the TTS-generated audio. Set by the pipeline after generation."),
    )

    def get_audio_duration(self) -> float:
        """Return the generated audio duration in seconds.

        Falls back to the base-class (dummy waveform) computation while
        the pipeline has not yet recorded a real duration.
        """
        duration = self.generated_audio_duration
        if duration is None:
            return super().get_audio_duration()
        return duration

    @property
    def text(self) -> str:
        """Original prompt text, reconstructed from the reference transcript."""
        return self.reference.get_transcript_string()


class SpeechGenerationDataset(BaseDataset[SpeechGenerationSample]):
    """Dataset for speech generation pipelines.

    Expects column: 'text' (the prompt string).
    No audio column is required since audio is generated
    by the pipeline itself.
    """

    _expected_columns = ["text"]
    _sample_class = SpeechGenerationSample

    def _extract_audio_info(self, row: dict) -> tuple[str, np.ndarray, int]:
        """Override to provide dummy audio info.

        Speech generation datasets don't have input audio, so a
        1-sample silent waveform is returned purely to satisfy the
        framework's sample structure; pipelines ignore it entirely.

        The audio name is taken from a non-empty 'audio_name' column
        when present; otherwise it is derived from the row's 'idx'.
        """
        # Check 'audio_name' first so rows that carry one don't require
        # an 'idx' key (the original order raised KeyError on such rows).
        if row.get("audio_name"):
            audio_name = str(row["audio_name"])
        else:
            audio_name = f"sample_{row['idx']}"
        dummy_waveform = np.zeros(1, dtype=np.float32)
        dummy_sample_rate = 16000
        return audio_name, dummy_waveform, dummy_sample_rate

    def prepare_sample(self, row: SpeechGenerationRow) -> tuple[Transcript, SpeechGenerationExtraInfo]:
        """Build the reference Transcript and extra info from a row.

        The 'text' prompt is whitespace-split into words to create the
        reference Transcript. An optional 'language' column and a
        non-empty 'dialogue' column are forwarded as extra info.
        """
        words = row["text"].split()
        reference = Transcript.from_words_info(
            words=words,
        )

        extra_info: SpeechGenerationExtraInfo = {}
        if "language" in row:
            extra_info["language"] = row["language"]
        if "dialogue" in row and row["dialogue"]:
            extra_info["dialogue"] = row["dialogue"]

        return reference, extra_info
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def compute_keyword_stats(
) -> dict[str, Any]:
"""Compute keyword statistics between reference and hypothesis."""

if not dictionary:
return {"true_positives": 0, "ground_truth": 0, "false_positives": 0, "keyword_stats": {}}

# Convert transcripts to text
ref_text = reference.get_transcript_string()
hyp_text = hypothesis.get_transcript_string()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def compute_metric(self, detail: Details) -> float:
PipelineType.TRANSCRIPTION,
PipelineType.ORCHESTRATION,
PipelineType.STREAMING_TRANSCRIPTION,
PipelineType.SPEECH_GENERATION,
),
MetricOptions.WER,
)
Expand Down
1 change: 1 addition & 0 deletions src/openbench/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .diarization import *
from .orchestration import *
from .pipeline_registry import PipelineRegistry
from .speech_generation import *
from .streaming_transcription import *
from .transcription import *

Expand Down
133 changes: 133 additions & 0 deletions src/openbench/pipeline/pipeline_aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@
WhisperXPipeline,
)
from .pipeline_registry import PipelineRegistry
from .speech_generation import (
CartesiaSpeechGenerationPipeline,
ElevenLabsDialogueGenerationPipeline,
ElevenLabsSpeechGenerationPipeline,
GeminiSpeechGenerationPipeline,
OpenAISpeechGenerationPipeline,
WhisperKitSpeechGenerationPipeline,
)
from .streaming_transcription import (
AssemblyAIStreamingPipeline,
DeepgramStreamingPipeline,
Expand Down Expand Up @@ -642,6 +650,131 @@ def register_pipeline_aliases() -> None:
description="PyannoteAI transcription pipeline (ignores speaker attribution). Uses the precision-2 model with Nvidia Parakeet STT. Requires `PYANNOTE_TOKEN` env var from https://www.pyannote.ai/.",
)

################# SPEECH GENERATION PIPELINES #################

PipelineRegistry.register_alias(
"whisperkit-speech-generation",
WhisperKitSpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"cli_path": os.getenv("WHISPERKIT_CLI_PATH"),
"speaker": "aiden",
"language": "english",
"seed": 10,
"temperature": 0.9,
"top_k": 50,
"max_new_tokens": 245,
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
},
description="WhisperKit speech generation pipeline. Generates audio from text prompts using whisperkit-cli TTS, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires `WHISPERKIT_CLI_PATH` env var pointing to the whisperkit-cli binary.",
)

PipelineRegistry.register_alias(
"cartesia-speech-generation",
CartesiaSpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"model_id": "sonic-3",
"voice_id": "e07c00bc-4134-4eae-9ea4-1a55fb45746b",
"container": "wav",
"encoding": "pcm_f32le",
"sample_rate": 44100,
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="Cartesia speech generation pipeline. Generates audio from text prompts using Cartesia TTS API, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires `CARTESIA_API_KEY` and `WHISPERKITPRO_CLI_PATH` env vars.",
)

PipelineRegistry.register_alias(
"elevenlabs-dialogue-generation",
ElevenLabsDialogueGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"model_id": "eleven_v3",
"speaker_voice_map": {
"doctor": "9BWtsMINqrJLrRacOk9x",
"patient": "IKne3meq5aSn9XLyUdCD",
"assistant": "pFZP5JQG7iQjIQuC4Bku",
},
"default_voice_id": "9BWtsMINqrJLrRacOk9x",
"max_chars_per_chunk": 4500,
"chunk_silence_duration": 0.75,
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="ElevenLabs dialogue generation pipeline. Generates multi-speaker conversational audio "
"from dialogue turns using ElevenLabs text_to_dialogue API, then transcribes the generated "
"audio to compute WER against the original dialogue text. "
"Requires `ELEVENLABS_API_KEY` and `WHISPERKITPRO_CLI_PATH` env vars.",
)

PipelineRegistry.register_alias(
"elevenlabs-speech-generation",
ElevenLabsSpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"voice_id": "JBFqnCBsd6RMkjVDRZzb",
"model_id": "eleven_multilingual_v2",
"output_format": "mp3_44100_128",
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="ElevenLabs speech generation pipeline. Generates audio from text prompts using ElevenLabs TTS API, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires `ELEVENLABS_API_KEY` and `WHISPERKITPRO_CLI_PATH` env vars.",
)

PipelineRegistry.register_alias(
"gemini-speech-generation",
GeminiSpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"voice_name": "Charon",
"language_code": "en-US",
"model_name": "gemini-2.5-pro-tts",
"audio_encoding": "MP3",
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="Google Gemini speech generation pipeline. Generates audio from text prompts using Google Cloud TTS, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires Google Cloud credentials and `WHISPERKITPRO_CLI_PATH` env var.",
)

PipelineRegistry.register_alias(
"openai-speech-generation",
OpenAISpeechGenerationPipeline,
default_config={
"out_dir": "./speech_generation_results",
"model": "gpt-4o-mini-tts",
"voice": "coral",
"response_format": "wav",
"speed": 1.0,
"transcription_cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"),
"transcription_repo_id": "argmaxinc/parakeetkit-pro",
"transcription_model_variant": "nvidia_parakeet-v2_476MB",
"keep_generated_audio": False,
},
description="OpenAI speech generation pipeline. Generates audio from text prompts using OpenAI TTS API, "
"then transcribes the generated audio to compute WER against the original prompt. "
"Requires `OPENAI_API_KEY` and `WHISPERKITPRO_CLI_PATH` env vars.",
)


################# STREAMING TRANSCRIPTION PIPELINES #################

PipelineRegistry.register_alias(
Expand Down
Loading
Loading