Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,19 @@ readme = "README.md"
requires-python = ">=3.11,<3.14"
license = "MIT"
dependencies = [
# --- Core (framework-agnostic) ---
"pydantic>=2.0",
"pydantic-settings>=2.0",
"pipecat-ai==0.0.104",
"elevenlabs>=1.0.0",
"openai>=1.0.0",
"anthropic>=0.83.0",
"litellm>=1.30.0",
"deepgram-sdk>=3.5.0,<4.0.0",
"onnxruntime>=1.16.0",
"aioboto3>=12.0.0",
"google-generativeai>=0.3.0",
"google-genai>=0.3.0",
"google-cloud-speech>=2.0.0",
"google-cloud-texttospeech>=2.0.0",
"azure-cognitiveservices-speech>=1.31.0",
"cartesia>=1.0.0",
"assemblyai>=0.17.0",
"setuptools>=65.0.0",
"fastapi>=0.100.0",
"uvicorn>=0.23.0",
"websockets>=12.0",
"httpx>=0.25.0",
"pyyaml>=6.0",
"pandas>=2.0",
Expand All @@ -54,10 +46,27 @@ dependencies = [
"jaconv>=0.3.0",
"regex>=2023.0.0",
"more-itertools>=10.0.0",
"nvidia-riva-client>=2.25.0,<3.0.0"
]

[project.optional-dependencies]
# Voice framework: choose one (pipecat or roomkit)
pipecat = [
"pipecat-ai==0.0.104",
"deepgram-sdk>=3.5.0,<4.0.0",
"onnxruntime>=1.16.0",
"google-cloud-speech>=2.0.0",
"google-cloud-texttospeech>=2.0.0",
"azure-cognitiveservices-speech>=1.31.0",
"cartesia>=1.0.0",
"assemblyai>=0.17.0",
"websockets>=12.0,<16.0",
"nvidia-riva-client>=2.25.0,<3.0.0",
]
roomkit = [
"roomkit>=0.7.0a2; python_version>='3.12'",
"deepgram-sdk>=6.0.0",
"websockets>=16.0",
]
dev = [
"pytest>=7.0",
"pytest-asyncio>=0.21",
Expand All @@ -76,6 +85,11 @@ apps = [
[project.scripts]
eva = "eva.cli:main"

[tool.uv]
# pipecat and roomkit have incompatible dependency versions
# (deepgram-sdk v3 vs v6, websockets <16 vs >=16)
conflicts = [[{extra = "pipecat"}, {extra = "roomkit"}]]

[tool.hatch.build.targets.wheel]
packages = ["src/eva"]

Expand Down
36 changes: 36 additions & 0 deletions src/eva/assistant/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
"""Assistant components for the voice agent benchmark framework."""

import importlib

from eva.assistant.base import AssistantServerBase

# Registry of available assistant frameworks.
# Each entry maps a framework name to the fully-qualified class path.
# Lazy imports prevent loading framework dependencies (e.g. Pipecat)
# when a different framework is selected.
_REGISTRY: dict[str, str] = {
    "pipecat": "eva.assistant.server.PipecatAssistantServer",
    "roomkit": "eva.assistant.roomkit_server.RoomKitAssistantServer",
}


def create_assistant_server(framework: str, **kwargs) -> AssistantServerBase:
    """Create an assistant server for the given framework.

    Args:
        framework: Framework identifier (e.g. ``"pipecat"``, ``"roomkit"``).
        **kwargs: Arguments forwarded to the server constructor.

    Returns:
        An ``AssistantServerBase`` implementation.

    Raises:
        ValueError: If the framework is not registered.
        ImportError: If the framework is registered but its optional
            dependencies (the matching ``eva[<framework>]`` extra) are
            not installed.
    """
    if framework not in _REGISTRY:
        available = ", ".join(sorted(_REGISTRY))
        raise ValueError(f"Unknown assistant framework '{framework}'. Available: {available}")

    module_path, class_name = _REGISTRY[framework].rsplit(".", 1)
    try:
        module = importlib.import_module(module_path)
    except ImportError as err:
        # Framework backends are gated behind mutually exclusive optional
        # extras (see pyproject.toml: eva[pipecat] / eva[roomkit]), so a
        # bare ImportError here usually means the extra is missing.
        # Chain the original error so the real cause stays visible.
        raise ImportError(
            f"Could not import backend for framework '{framework}'. "
            f"Is the matching extra installed? Try: pip install 'eva[{framework}]'"
        ) from err
    cls = getattr(module, class_name)
    return cls(**kwargs)
91 changes: 91 additions & 0 deletions src/eva/assistant/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Abstract base class for assistant servers in the EVA benchmark framework.

Any voice framework (Pipecat, RoomKit, etc.) must implement this interface
so that the orchestrator can run conversations framework-agnostically.
"""

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any

from eva.models.agents import AgentConfig
from eva.models.config import AudioLLMConfig, PipelineConfig, SpeechToSpeechConfig


class AssistantServerBase(ABC):
    """Framework-agnostic interface the orchestrator drives conversations through.

    Every voice stack (Pipecat, RoomKit, ...) supplies one implementation.
    An implementation is required to:
    - Accept WebSocket connections on the assigned port, speaking the
      Twilio-style JSON protocol (events: connected, start, media, stop)
      with 8 kHz mu-law audio encoding.
    - Persist every required artifact into ``output_dir`` when ``stop()``
      runs.
    - Report conversation statistics through ``get_conversation_stats()``.
    """

    @abstractmethod
    def __init__(
        self,
        current_date_time: str,
        pipeline_config: PipelineConfig | SpeechToSpeechConfig | AudioLLMConfig,
        agent: AgentConfig,
        agent_config_path: str,
        scenario_db_path: str,
        output_dir: Path,
        port: int,
        conversation_id: str,
    ) -> None:
        """Set up the server for a single conversation.

        Args:
            current_date_time: Date/time string taken from the evaluation record.
            pipeline_config: Model pipeline settings (STT+LLM+TTS, S2S, or Audio-LLM).
            agent: Agent configuration parsed from YAML.
            agent_config_path: Location of the agent YAML configuration file.
            scenario_db_path: Location of the scenario database JSON file.
            output_dir: Directory that will receive all output artifacts.
            port: WebSocket port the server listens on.
            conversation_id: Unique identifier of this conversation.
        """
        ...

    @abstractmethod
    async def start(self) -> None:
        """Bring up the WebSocket server.

        Must not return until connections are being accepted on the assigned port.
        """
        ...

    @abstractmethod
    async def stop(self) -> None:
        """Shut the server down and flush all output files into ``output_dir``.

        Required output files:
        - ``audit_log.json`` — structured conversation log
        - ``transcript.jsonl`` — ``{timestamp, role, content}`` per line
        - ``initial_scenario_db.json`` — scenario DB snapshot before conversation
        - ``final_scenario_db.json`` — scenario DB snapshot after conversation
        - ``audio_mixed.wav`` — mixed user+assistant audio (16-bit PCM)
        - ``audio_user.wav`` — user-only audio track
        - ``audio_assistant.wav`` — assistant-only audio track
        - ``framework_logs.jsonl`` — JSONL with event types:
          ``tts_text``, ``llm_response``, ``turn_start``, ``turn_end``
        - ``response_latencies.json`` — ``{latencies, mean, max, count}``

        Optional (framework-specific):
        - ``pipecat_metrics.jsonl`` or equivalent latency metrics
        """
        ...

    @abstractmethod
    def get_conversation_stats(self) -> dict[str, Any]:
        """Report statistics for the finished conversation.

        The returned mapping must contain at least:
        - ``num_turns``: number of user turns
        - ``num_tool_calls``: total tool invocations
        - ``tools_called``: list of tool names invoked
        """
        ...
2 changes: 1 addition & 1 deletion src/eva/assistant/pipeline/observers.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(self, output_path: str, conversation_id: str, clock: WallClock, tur
self.output_path = Path(output_path)
self.conversation_id = conversation_id
self.clock = clock
self.log_file = self.output_path / "pipecat_logs.jsonl"
self.log_file = self.output_path / "framework_logs.jsonl"

# Ensure output directory exists
self.output_path.mkdir(parents=True, exist_ok=True)
Expand Down
Loading