Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ This project is designed for AI-first development. All agents MUST follow these
2. Add a concise entry under the `Unreleased` section (or the latest version block if no `Unreleased` section is present) describing what was added, changed, or fixed.
3. Follow the existing format of the file (typically [Keep a Changelog](https://keepachangelog.com/) style).
4. Update the changelog **before** marking the task as complete.
- **STYLE**: Entries must be **lean and user-facing**. Describe *what the user or operator gains*, not how it is implemented. No class names, endpoint paths, internal module names, or technical details. One sentence per entry is the target.


## Project Overview
Expand Down
6 changes: 6 additions & 0 deletions backend/omni/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- Real-time voice chat, with audio streamed over a WebSocket connection.

## [0.0.3] - 2026-04-28

### Added
Expand Down
9 changes: 5 additions & 4 deletions backend/omni/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@ dependencies = [
"sqlalchemy",
"strands-agents",
"strands-agents-tools",
"httpx>=0.28.1",
"itsdangerous>=2.2.0",
"authlib[httpx]>=1.6.9",
"httpx",
"itsdangerous",
"authlib[httpx]",
"websockets",
]

[dependency-groups]
dev = [
"cryptography>=46.0.7",
"cryptography",
"datamodel-code-generator[ruff]",
"pytest",
"pytest-asyncio",
Expand Down
6 changes: 6 additions & 0 deletions backend/omni/src/modai/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ modules:
module_dependencies:
predefined: "predefined_tool_registry"

audio_realtime_router:
class: modai.modules.audio_realtime.openai_audio_realtime.OpenAIAudioRealtimeModule
module_dependencies:
llm_provider_module: openai_model_provider
session: "session"

full_reset:
class: modai.modules.reset.reset_web_module.ResetWebModule
module_dependencies:
Expand Down
38 changes: 38 additions & 0 deletions backend/omni/src/modai/modules/audio_realtime/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Audio Realtime Module

## Interface

**Module type**: `AudioRealtimeWebModule` (Web Module — registers a FastAPI router)

**Endpoint**: `GET /api/realtime` (WebSocket upgrade)

The client opens a WebSocket to this endpoint and exchanges
[OpenAI Realtime API](https://platform.openai.com/docs/guides/realtime) events as JSON text frames.
The backend opens a second WebSocket to the configured LLM provider and proxies all
frames in both directions — the provider API key never leaves the server.

**Query parameter**: `model=<provider_name>/<model_name>` (required)
Example: `model=openai/gpt-4o-realtime-preview`

**Close codes**: `4000` bad request · `4001` unauthorized · `4004` provider not found ·
`4500` internal error.

---

## `OpenAIAudioRealtimeModule`

**Class**: `modai.modules.audio_realtime.openai_audio_realtime.OpenAIAudioRealtimeModule`

Resolves the provider by name from the `llm_provider_module` dependency, builds
the upstream `wss://` URL, and proxies all frames bidirectionally until either side closes.

### `config.yaml` snippet

```yaml
audio_realtime_router:
class: modai.modules.audio_realtime.openai_audio_realtime.OpenAIAudioRealtimeModule
module_dependencies:
llm_provider_module: openai_model_provider # required – any ModelProviderModule
session: session # required – any SessionModule
```

Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import pytest
from unittest.mock import AsyncMock, MagicMock

from modai.module import ModuleDependencies
from modai.modules.model_provider.module import (
ModelProviderResponse,
ModelProvidersListResponse,
)
from modai.modules.audio_realtime.openai_audio_realtime import (
OpenAIAudioRealtimeModule,
_build_ws_url,
_resolve_provider_and_model,
)
from modai.modules.session.module import SessionModule, Session


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _make_provider(
    name: str = "llmgateway", base_url: str = "https://llmgateway.example.org/v1"
) -> ModelProviderResponse:
    """Build a ``ModelProviderResponse`` fixture of type ``openai``.

    Only ``name`` and ``base_url`` are configurable; the id, API key,
    properties and timestamps are fixed placeholder values.
    """
    # Values every test provider shares, kept apart from the per-test inputs.
    fixed_fields = {
        "id": "provider-1",
        "type": "openai",
        "api_key": "sk-test",
        "properties": {},
        "created_at": None,
        "updated_at": None,
    }
    return ModelProviderResponse(name=name, base_url=base_url, **fixed_fields)


def _make_session_module(valid: bool = True) -> MagicMock:
    """Return a ``SessionModule`` mock with a preconfigured ``validate_session``.

    When ``valid`` is truthy the mock returns a ``Session`` for ``test-user``;
    otherwise calling ``validate_session`` raises ``InvalidSessionError``.
    """
    mocked_sessions = MagicMock(spec=SessionModule)

    if not valid:
        # Imported lazily: only the failure path needs the exception type.
        from modai.modules.session.module import InvalidSessionError

        mocked_sessions.validate_session.side_effect = InvalidSessionError(
            "no session"
        )
        return mocked_sessions

    mocked_sessions.validate_session.return_value = Session(
        user_id="test-user", additional={}
    )
    return mocked_sessions


def _make_provider_module(providers: list[ModelProviderResponse]) -> MagicMock:
    """Return a ``ModelProviderModule`` mock whose ``get_providers`` lists ``providers``.

    ``get_providers`` is an ``AsyncMock`` resolving to a
    ``ModelProvidersListResponse`` that wraps the given providers, with
    ``total`` set to their count and no pagination values.
    """
    from modai.modules.model_provider.module import ModelProviderModule

    mocked_providers = MagicMock(spec=ModelProviderModule)
    listing = ModelProvidersListResponse(
        providers=providers,
        total=len(providers),
        limit=None,
        offset=None,
    )
    mocked_providers.get_providers = AsyncMock(return_value=listing)
    return mocked_providers


def _make_module(
    providers: list[ModelProviderResponse] | None = None,
    session_valid: bool = True,
) -> OpenAIAudioRealtimeModule:
    """Construct an ``OpenAIAudioRealtimeModule`` wired to mocked dependencies.

    ``providers`` defaults to a single provider from ``_make_provider()``.
    ``session_valid`` selects whether the mocked session module accepts or
    rejects sessions.
    """
    if providers is None:
        providers = [_make_provider()]

    session_module = _make_session_module(session_valid)
    provider_module = _make_provider_module(providers)

    def _lookup(name):
        # Resolve the two dependency names the module asks for; anything
        # else is reported as missing.
        if name == "llm_provider_module":
            return provider_module
        if name == "session":
            return session_module
        return None

    mock_deps = MagicMock(spec=ModuleDependencies)
    mock_deps.get_module.side_effect = _lookup

    return OpenAIAudioRealtimeModule(dependencies=mock_deps, config={})


def _make_upstream_mock(messages: list[str] | None = None) -> MagicMock:
"""Mock an upstream websockets connection that yields the given messages."""
upstream = AsyncMock()
upstream.send = AsyncMock()

async def aiter_messages():
for m in messages or []:
yield m

upstream.__aiter__ = lambda self: aiter_messages()
upstream.__aenter__ = AsyncMock(return_value=upstream)
upstream.__aexit__ = AsyncMock(return_value=False)
return upstream


# ---------------------------------------------------------------------------
# URL construction tests
# ---------------------------------------------------------------------------


def test_build_ws_url_base_with_v1():
    """An ``https`` base ending in ``/v1`` becomes ``wss`` with ``/realtime`` appended."""
    expected = "wss://api.openai.com/v1/realtime?model=gpt-mini"
    actual = _build_ws_url("https://api.openai.com/v1", "gpt-mini")
    assert actual == expected


def test_build_ws_url_trailing_slash():
    """A trailing slash on the base URL does not produce a doubled slash."""
    expected = "wss://api.openai.com/v1/realtime?model=gpt-mini"
    actual = _build_ws_url("https://api.openai.com/v1/", "gpt-mini")
    assert actual == expected


def test_build_ws_url_http_becomes_ws():
    """A plain ``http`` base URL maps to the unencrypted ``ws`` scheme."""
    expected = "ws://localhost:8080/v1/realtime?model=gpt-mini"
    actual = _build_ws_url("http://localhost:8080/v1", "gpt-mini")
    assert actual == expected


# ---------------------------------------------------------------------------
# Provider resolution and module construction tests
# ---------------------------------------------------------------------------


def test_resolve_provider_and_model_simple():
    """A plain ``providerName/model`` string resolves to the matching provider."""
    known = _make_provider("llmgateway")

    outcome = _resolve_provider_and_model("llmgateway/gpt-realtime-mini", [known])

    assert outcome is not None
    matched, model = outcome
    assert matched.name == "llmgateway"
    assert model == "gpt-realtime-mini"


def test_resolve_provider_and_model_compound():
    """Slashes after the provider prefix stay part of the model name."""
    known = _make_provider("llmgateway")
    requested = "llmgateway/openai/gpt-realtime-mini-2025-12-15"

    outcome = _resolve_provider_and_model(requested, [known])

    assert outcome is not None
    matched, model = outcome
    assert matched.name == "llmgateway"
    assert model == "openai/gpt-realtime-mini-2025-12-15"


def test_resolve_provider_and_model_no_match():
    """``None`` is returned when the provider prefix names no known provider."""
    known = _make_provider("llmgateway")
    outcome = _resolve_provider_and_model("openai/gpt-4o-realtime", [known])
    assert outcome is None


def test_resolve_provider_and_model_no_slash():
    """Inputs without a ``/`` separator, including the empty string, yield ``None``."""
    candidates = [_make_provider("llmgateway")]

    without_separator = _resolve_provider_and_model("gpt-realtime", candidates)
    assert without_separator is None

    empty_input = _resolve_provider_and_model("", candidates)
    assert empty_input is None


def test_missing_llm_provider_module_raises_on_init():
    """Construction fails with ``ValueError`` if ``llm_provider_module`` is absent."""
    # Every dependency lookup reports "not configured".
    empty_deps = MagicMock(spec=ModuleDependencies)
    empty_deps.get_module.return_value = None

    expectation = pytest.raises(ValueError, match="llm_provider_module")
    with expectation:
        OpenAIAudioRealtimeModule(dependencies=empty_deps, config={})
44 changes: 44 additions & 0 deletions backend/omni/src/modai/modules/audio_realtime/module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Realtime Web Module: Interface for the realtime WebSocket proxy endpoint.
"""

from abc import ABC, abstractmethod
from typing import Any

from fastapi import APIRouter, Query, WebSocket

from modai.module import ModaiModule, ModuleDependencies


class AudioRealtimeWebModule(ModaiModule, ABC):
    """
    Module Declaration for: Realtime WebSocket Proxy

    Registers ``/api/realtime`` as a WebSocket route on its own router.
    Clients connect there and exchange OpenAI Realtime API events as JSON
    text frames. Implementations are expected to open a WebSocket to the
    configured LLM provider and relay frames both ways, so the provider
    API key stays on the server.
    """

    def __init__(self, dependencies: ModuleDependencies, config: dict[str, Any]):
        super().__init__(dependencies, config)
        # Build the router first, then expose it, so the attribute always
        # carries the registered WebSocket route.
        realtime_router = APIRouter()
        realtime_router.add_api_websocket_route("/api/realtime", self.websocket_proxy)
        self.router = realtime_router

    @abstractmethod
    async def websocket_proxy(
        self,
        ws: WebSocket,
        model: str = Query(...),
    ) -> None:
        """
        Handle one client WebSocket connection for the realtime endpoint.

        Implementations accept the client socket, connect to the LLM
        provider over WebSocket, and forward JSON events in both
        directions until either side closes.

        ``model`` is a required query parameter of the form
        ``<provider_name>/<model_name>``
        (e.g. ``myprovider/gpt-realtime-mini-2025-12-15``).
        """
Loading
Loading