Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,19 @@ readme = "README.md"
requires-python = ">=3.11,<3.14"
license = "MIT"
dependencies = [
# --- Core (framework-agnostic) ---
"pydantic>=2.0",
"pydantic-settings>=2.0",
"pipecat-ai==0.0.104",
"elevenlabs>=1.0.0",
"openai>=1.0.0",
"anthropic>=0.83.0",
"litellm>=1.30.0",
"deepgram-sdk>=3.5.0,<4.0.0",
"onnxruntime>=1.16.0",
"aioboto3>=12.0.0",
"google-generativeai>=0.3.0",
"google-genai>=0.3.0",
"google-cloud-speech>=2.0.0",
"google-cloud-texttospeech>=2.0.0",
"azure-cognitiveservices-speech>=1.31.0",
"cartesia>=1.0.0",
"assemblyai>=0.17.0",
"setuptools>=65.0.0",
"fastapi>=0.100.0",
"uvicorn>=0.23.0",
"websockets>=12.0",
"httpx>=0.25.0",
"pyyaml>=6.0",
"pandas>=2.0",
Expand All @@ -54,10 +46,27 @@ dependencies = [
"jaconv>=0.3.0",
"regex>=2023.0.0",
"more-itertools>=10.0.0",
"nvidia-riva-client>=2.25.0,<3.0.0"
]

[project.optional-dependencies]
# Voice framework: choose one (pipecat or roomkit)
pipecat = [
"pipecat-ai==0.0.104",
"deepgram-sdk>=3.5.0,<4.0.0",
"onnxruntime>=1.16.0",
"google-cloud-speech>=2.0.0",
"google-cloud-texttospeech>=2.0.0",
"azure-cognitiveservices-speech>=1.31.0",
"cartesia>=1.0.0",
"assemblyai>=0.17.0",
"websockets>=12.0,<16.0",
"nvidia-riva-client>=2.25.0,<3.0.0",
]
roomkit = [
"roomkit>=0.7.0a2; python_version>='3.12'",
"deepgram-sdk>=6.0.0",
"websockets>=16.0",
]
dev = [
"pytest>=7.0",
"pytest-asyncio>=0.21",
Expand All @@ -76,6 +85,11 @@ apps = [
[project.scripts]
eva = "eva.cli:main"

[tool.uv]
# pipecat and roomkit have incompatible dependency versions
# (deepgram-sdk v3 vs v6, websockets <16 vs >=16)
conflicts = [[{extra = "pipecat"}, {extra = "roomkit"}]]

[tool.hatch.build.targets.wheel]
packages = ["src/eva"]

Expand Down
36 changes: 36 additions & 0 deletions src/eva/assistant/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
"""Assistant components for the voice agent benchmark framework."""

import importlib

from eva.assistant.base import AssistantServerBase

# Registry of available assistant frameworks.
# Each entry maps a framework name to the fully-qualified class path.
# Lazy imports prevent loading framework dependencies (e.g. Pipecat)
# when a different framework is selected.
_REGISTRY: dict[str, str] = {
    "pipecat": "eva.assistant.server.PipecatAssistantServer",
    "roomkit": "eva.assistant.roomkit_server.RoomKitAssistantServer",
}


def create_assistant_server(framework: str, **kwargs) -> AssistantServerBase:
    """Create an assistant server for the given framework.

    Args:
        framework: Framework identifier (e.g. ``"pipecat"``, ``"roomkit"``).
        **kwargs: Arguments forwarded to the server constructor.

    Returns:
        An ``AssistantServerBase`` implementation.

    Raises:
        ValueError: If the framework is not registered.
        ImportError: If the framework is registered but its optional
            dependencies (the matching ``eva[<framework>]`` extra) are
            not installed.
    """
    if framework not in _REGISTRY:
        available = ", ".join(sorted(_REGISTRY))
        raise ValueError(f"Unknown assistant framework '{framework}'. Available: {available}")

    module_path, class_name = _REGISTRY[framework].rsplit(".", 1)
    try:
        module = importlib.import_module(module_path)
    except ImportError as err:
        # Framework backends are gated behind mutually exclusive optional
        # extras (see pyproject.toml: eva[pipecat] / eva[roomkit]), so a
        # bare ImportError here usually means the extra is missing.
        # Chain the original error so the real cause stays visible.
        raise ImportError(
            f"Could not import backend for framework '{framework}'. "
            f"Is the matching extra installed? Try: pip install 'eva[{framework}]'"
        ) from err
    cls = getattr(module, class_name)
    return cls(**kwargs)
91 changes: 91 additions & 0 deletions src/eva/assistant/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Abstract base class for assistant servers in the EVA benchmark framework.

Any voice framework (Pipecat, RoomKit, etc.) must implement this interface
so that the orchestrator can run conversations framework-agnostically.
"""

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any

from eva.models.agents import AgentConfig
from eva.models.config import AudioLLMConfig, PipelineConfig, SpeechToSpeechConfig


class AssistantServerBase(ABC):
    """Framework-agnostic interface the orchestrator drives conversations through.

    Every voice stack (Pipecat, RoomKit, ...) supplies one implementation.
    An implementation is required to:
    - Accept WebSocket connections on the assigned port, speaking the
      Twilio-style JSON protocol (events: connected, start, media, stop)
      with 8 kHz mu-law audio encoding.
    - Persist every required artifact into ``output_dir`` when ``stop()``
      runs.
    - Report conversation statistics through ``get_conversation_stats()``.
    """

    @abstractmethod
    def __init__(
        self,
        current_date_time: str,
        pipeline_config: PipelineConfig | SpeechToSpeechConfig | AudioLLMConfig,
        agent: AgentConfig,
        agent_config_path: str,
        scenario_db_path: str,
        output_dir: Path,
        port: int,
        conversation_id: str,
    ) -> None:
        """Set up the server for a single conversation.

        Args:
            current_date_time: Date/time string taken from the evaluation record.
            pipeline_config: Model pipeline settings (STT+LLM+TTS, S2S, or Audio-LLM).
            agent: Agent configuration parsed from YAML.
            agent_config_path: Location of the agent YAML configuration file.
            scenario_db_path: Location of the scenario database JSON file.
            output_dir: Directory that will receive all output artifacts.
            port: WebSocket port the server listens on.
            conversation_id: Unique identifier of this conversation.
        """
        ...

    @abstractmethod
    async def start(self) -> None:
        """Bring up the WebSocket server.

        Must not return until connections are being accepted on the assigned port.
        """
        ...

    @abstractmethod
    async def stop(self) -> None:
        """Shut the server down and flush all output files into ``output_dir``.

        Required output files:
        - ``audit_log.json`` — structured conversation log
        - ``transcript.jsonl`` — ``{timestamp, role, content}`` per line
        - ``initial_scenario_db.json`` — scenario DB snapshot before conversation
        - ``final_scenario_db.json`` — scenario DB snapshot after conversation
        - ``audio_mixed.wav`` — mixed user+assistant audio (16-bit PCM)
        - ``audio_user.wav`` — user-only audio track
        - ``audio_assistant.wav`` — assistant-only audio track
        - ``framework_logs.jsonl`` — JSONL with event types:
          ``tts_text``, ``llm_response``, ``turn_start``, ``turn_end``
        - ``response_latencies.json`` — ``{latencies, mean, max, count}``

        Optional (framework-specific):
        - ``pipecat_metrics.jsonl`` or equivalent latency metrics
        """
        ...

    @abstractmethod
    def get_conversation_stats(self) -> dict[str, Any]:
        """Report statistics for the finished conversation.

        The returned mapping must contain at least:
        - ``num_turns``: number of user turns
        - ``num_tool_calls``: total tool invocations
        - ``tools_called``: list of tool names invoked
        """
        ...
2 changes: 1 addition & 1 deletion src/eva/assistant/pipeline/observers.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(self, output_path: str, conversation_id: str, clock: WallClock, tur
self.output_path = Path(output_path)
self.conversation_id = conversation_id
self.clock = clock
self.log_file = self.output_path / "pipecat_logs.jsonl"
self.log_file = self.output_path / "framework_logs.jsonl"

# Ensure output directory exists
self.output_path.mkdir(parents=True, exist_ok=True)
Expand Down
Loading