From 8d8de9b843a37e605ffd301ed2b1b6d158d49177 Mon Sep 17 00:00:00 2001 From: guenhter Date: Thu, 7 May 2026 16:05:42 +0200 Subject: [PATCH 1/2] refactor: searchable list for languages --- .../user-settings/LocalizationRoute.svelte | 65 ++++++++++++++----- .../src/modules/user-settings/locales/de.json | 4 +- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/frontend/omni/src/modules/user-settings/LocalizationRoute.svelte b/frontend/omni/src/modules/user-settings/LocalizationRoute.svelte index 274f751..1b62d20 100644 --- a/frontend/omni/src/modules/user-settings/LocalizationRoute.svelte +++ b/frontend/omni/src/modules/user-settings/LocalizationRoute.svelte @@ -1,5 +1,9 @@ @@ -28,18 +38,41 @@ function handleLanguageChange(event: Event) {
- - + + + + {#snippet child({ props })} + + {/snippet} + + + + + + {t("noLanguageFound", { defaultValue: "No language found." })} + {#each languages as lang (lang.code)} + handleSelect(lang.code)} + > + {lang.label} + {#if selectedLanguage === lang.code} + + {/if} + + {/each} + + + +
diff --git a/frontend/omni/src/modules/user-settings/locales/de.json b/frontend/omni/src/modules/user-settings/locales/de.json index e26ceaa..ceb546c 100644 --- a/frontend/omni/src/modules/user-settings/locales/de.json +++ b/frontend/omni/src/modules/user-settings/locales/de.json @@ -3,5 +3,7 @@ "localizationNavLabel": "Sprache & Region", "localizationTitle": "Sprache & Lokalisierung", "localizationSubtitle": "Konfigurieren Sie Ihre bevorzugte Sprache.", - "languageLabel": "Sprache" + "languageLabel": "Sprache", + "searchLanguage": "Sprache suchen...", + "noLanguageFound": "Keine Sprache gefunden." } From bdd23031a4cdc1253d3cfe0434371d2b21648f2f Mon Sep 17 00:00:00 2001 From: guenhter Date: Tue, 12 May 2026 11:52:41 +0200 Subject: [PATCH 2/2] feat: voice mode --- AGENTS.md | 1 + backend/omni/CHANGELOG.md | 6 + backend/omni/pyproject.toml | 9 +- backend/omni/src/modai/default_config.yaml | 6 + .../modai/modules/audio_realtime/README.md | 38 +++ .../modai/modules/audio_realtime/__init__.py | 0 .../audio_realtime/__tests__/__init__.py | 0 .../__tests__/test_openai_audio_realtime.py | 169 ++++++++++++ .../modai/modules/audio_realtime/module.py | 44 ++++ .../audio_realtime/openai_audio_realtime.py | 196 ++++++++++++++ .../tools/tool_registry_predefined_vars.py | 2 +- .../src/modai/modules/user_settings/README.md | 2 +- backend/omni/uv.lock | 46 +++- frontend/omni/CHANGELOG.md | 5 + .../omni/public/modules_browser_only.json | 27 +- .../omni/public/modules_with_backend.json | 27 +- .../src/modules/audio-service/index.svelte.ts | 23 ++ .../modules/audio-service/realtimeAudio.ts | 32 +++ .../modules/audio-service/realtimeConfig.ts | 43 +++ .../audio-service/realtimeEventHandler.ts | 64 +++++ .../modules/audio-service/realtimeMicSetup.ts | 37 +++ .../audio-service/realtimeSession.svelte.ts | 120 +++++++++ .../modules/audio-service/realtimeWsParams.ts | 41 +++ .../audio-service/webAudioService.svelte.ts | 20 ++ .../modules/audio-settings/AudioRoute.svelte | 248 
++++++++++++++++++ .../audio-settings/audioNavigationItem.svelte | 18 ++ .../audioRouteDefinition.svelte.ts | 12 + .../audio-settings/audioSettings.svelte.ts | 67 +++++ .../modules/audio-settings/locales/de.json | 12 + .../src/modules/chat/ChatComponent.svelte | 141 ++++------ .../src/modules/chat/ChatInputPanel.svelte | 4 + .../modules/chat/ChatRealtimeButton.svelte | 72 +++++ .../src/modules/chat/chatMessages.svelte.ts | 187 +++++++++++++ .../omni/src/modules/chat/locales/de.json | 5 + frontend/omni/vite.config.ts | 5 +- 35 files changed, 1627 insertions(+), 102 deletions(-) create mode 100644 backend/omni/src/modai/modules/audio_realtime/README.md create mode 100644 backend/omni/src/modai/modules/audio_realtime/__init__.py create mode 100644 backend/omni/src/modai/modules/audio_realtime/__tests__/__init__.py create mode 100644 backend/omni/src/modai/modules/audio_realtime/__tests__/test_openai_audio_realtime.py create mode 100644 backend/omni/src/modai/modules/audio_realtime/module.py create mode 100644 backend/omni/src/modai/modules/audio_realtime/openai_audio_realtime.py create mode 100644 frontend/omni/src/modules/audio-service/index.svelte.ts create mode 100644 frontend/omni/src/modules/audio-service/realtimeAudio.ts create mode 100644 frontend/omni/src/modules/audio-service/realtimeConfig.ts create mode 100644 frontend/omni/src/modules/audio-service/realtimeEventHandler.ts create mode 100644 frontend/omni/src/modules/audio-service/realtimeMicSetup.ts create mode 100644 frontend/omni/src/modules/audio-service/realtimeSession.svelte.ts create mode 100644 frontend/omni/src/modules/audio-service/realtimeWsParams.ts create mode 100644 frontend/omni/src/modules/audio-service/webAudioService.svelte.ts create mode 100644 frontend/omni/src/modules/audio-settings/AudioRoute.svelte create mode 100644 frontend/omni/src/modules/audio-settings/audioNavigationItem.svelte create mode 100644 frontend/omni/src/modules/audio-settings/audioRouteDefinition.svelte.ts create mode 
100644 frontend/omni/src/modules/audio-settings/audioSettings.svelte.ts create mode 100644 frontend/omni/src/modules/audio-settings/locales/de.json create mode 100644 frontend/omni/src/modules/chat/ChatRealtimeButton.svelte create mode 100644 frontend/omni/src/modules/chat/chatMessages.svelte.ts diff --git a/AGENTS.md b/AGENTS.md index 9ee372f..5c6e767 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -63,6 +63,7 @@ This project is designed for AI-first development. All agents MUST follow these 2. Add a concise entry under the `Unreleased` section (or the latest version block if no `Unreleased` section is present) describing what was added, changed, or fixed. 3. Follow the existing format of the file (typically [Keep a Changelog](https://keepachangelog.com/) style). 4. Update the changelog **before** marking the task as complete. +- **STYLE**: Entries must be **lean and user-facing**. Describe *what the user or operator gains*, not how it is implemented. No class names, endpoint paths, internal module names, or technical details. One sentence per entry is the target. ## Project Overview diff --git a/backend/omni/CHANGELOG.md b/backend/omni/CHANGELOG.md index 26e235b..c9608f3 100644 --- a/backend/omni/CHANGELOG.md +++ b/backend/omni/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added + +- Real-time audio voice chat - websocket based. 
+ ## [0.0.3] - 2026-04-28 ### Added diff --git a/backend/omni/pyproject.toml b/backend/omni/pyproject.toml index 22a8fc9..45d937b 100644 --- a/backend/omni/pyproject.toml +++ b/backend/omni/pyproject.toml @@ -14,14 +14,15 @@ dependencies = [ "sqlalchemy", "strands-agents", "strands-agents-tools", - "httpx>=0.28.1", - "itsdangerous>=2.2.0", - "authlib[httpx]>=1.6.9", + "httpx", + "itsdangerous", + "authlib[httpx]", + "websockets", ] [dependency-groups] dev = [ - "cryptography>=46.0.7", + "cryptography", "datamodel-code-generator[ruff]", "pytest", "pytest-asyncio", diff --git a/backend/omni/src/modai/default_config.yaml b/backend/omni/src/modai/default_config.yaml index a669067..cefb41e 100644 --- a/backend/omni/src/modai/default_config.yaml +++ b/backend/omni/src/modai/default_config.yaml @@ -104,6 +104,12 @@ modules: module_dependencies: predefined: "predefined_tool_registry" + audio_realtime_router: + class: modai.modules.audio_realtime.openai_audio_realtime.OpenAIAudioRealtimeModule + module_dependencies: + llm_provider_module: openai_model_provider + session: "session" + full_reset: class: modai.modules.reset.reset_web_module.ResetWebModule module_dependencies: diff --git a/backend/omni/src/modai/modules/audio_realtime/README.md b/backend/omni/src/modai/modules/audio_realtime/README.md new file mode 100644 index 0000000..ad85757 --- /dev/null +++ b/backend/omni/src/modai/modules/audio_realtime/README.md @@ -0,0 +1,38 @@ +# Audio Realtime Module + +## Interface + +**Module type**: `AudioRealtimeWebModule` (Web Module — registers a FastAPI router) + +**Endpoint**: `GET /api/realtime` (WebSocket upgrade) + +The client opens a WebSocket to this endpoint and exchanges +[OpenAI Realtime API](https://platform.openai.com/docs/guides/realtime) events as JSON text frames. +The backend opens a second WebSocket to the configured LLM provider and proxies all +frames in both directions — the provider API key never leaves the server. 
+ +**Query parameter**: `model=/` (required) +Example: `model=openai/gpt-4o-realtime-preview` + +**Close codes**: `4000` bad request · `4001` unauthorized · `4004` provider not found · +`4500` internal error. + +--- + +## `OpenAIAudioRealtimeModule` + +**Class**: `modai.modules.audio_realtime.openai_audio_realtime.OpenAIAudioRealtimeModule` + +Resolves the provider by name from the `llm_provider_module` dependency, builds +the upstream `wss://` URL, and proxies all frames bidirectionally until either side closes. + +### `config.yaml` snippet + +```yaml +audio_realtime_router: + class: modai.modules.audio_realtime.openai_audio_realtime.OpenAIAudioRealtimeModule + module_dependencies: + llm_provider_module: openai_model_provider # required – any ModelProviderModule + session: session # required – any SessionModule +``` + diff --git a/backend/omni/src/modai/modules/audio_realtime/__init__.py b/backend/omni/src/modai/modules/audio_realtime/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/omni/src/modai/modules/audio_realtime/__tests__/__init__.py b/backend/omni/src/modai/modules/audio_realtime/__tests__/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/omni/src/modai/modules/audio_realtime/__tests__/test_openai_audio_realtime.py b/backend/omni/src/modai/modules/audio_realtime/__tests__/test_openai_audio_realtime.py new file mode 100644 index 0000000..057beed --- /dev/null +++ b/backend/omni/src/modai/modules/audio_realtime/__tests__/test_openai_audio_realtime.py @@ -0,0 +1,169 @@ +import pytest +from unittest.mock import AsyncMock, MagicMock + +from modai.module import ModuleDependencies +from modai.modules.model_provider.module import ( + ModelProviderResponse, + ModelProvidersListResponse, +) +from modai.modules.audio_realtime.openai_audio_realtime import ( + OpenAIAudioRealtimeModule, + _build_ws_url, + _resolve_provider_and_model, +) +from modai.modules.session.module import SessionModule, Session + + +# 
def _make_module(
    providers: list[ModelProviderResponse] | None = None,
    session_valid: bool = True,
) -> OpenAIAudioRealtimeModule:
    """Build an ``OpenAIAudioRealtimeModule`` wired to mocked dependencies.

    ``providers`` defaults to a single provider from ``_make_provider()``;
    ``session_valid`` selects whether the mocked session module accepts or
    rejects the session.
    """
    resolved_providers = [_make_provider()] if providers is None else providers

    session_module = _make_session_module(session_valid)
    provider_module = _make_provider_module(resolved_providers)

    def lookup(name):
        # Only the two dependencies the module asks for are known; any
        # other name resolves to None.
        if name == "llm_provider_module":
            return provider_module
        if name == "session":
            return session_module
        return None

    mock_deps = MagicMock(spec=ModuleDependencies)
    mock_deps.get_module.side_effect = lookup

    return OpenAIAudioRealtimeModule(dependencies=mock_deps, config={})
connection that yields the given messages.""" + upstream = AsyncMock() + upstream.send = AsyncMock() + + async def aiter_messages(): + for m in messages or []: + yield m + + upstream.__aiter__ = lambda self: aiter_messages() + upstream.__aenter__ = AsyncMock(return_value=upstream) + upstream.__aexit__ = AsyncMock(return_value=False) + return upstream + + +# --------------------------------------------------------------------------- +# URL construction tests +# --------------------------------------------------------------------------- + + +def test_build_ws_url_base_with_v1(): + assert _build_ws_url("https://api.openai.com/v1", "gpt-mini") == ( + "wss://api.openai.com/v1/realtime?model=gpt-mini" + ) + + +def test_build_ws_url_trailing_slash(): + assert _build_ws_url("https://api.openai.com/v1/", "gpt-mini") == ( + "wss://api.openai.com/v1/realtime?model=gpt-mini" + ) + + +def test_build_ws_url_http_becomes_ws(): + assert _build_ws_url("http://localhost:8080/v1", "gpt-mini") == ( + "ws://localhost:8080/v1/realtime?model=gpt-mini" + ) + + +# --------------------------------------------------------------------------- +# WebSocket endpoint tests +# --------------------------------------------------------------------------- + + +def test_resolve_provider_and_model_simple(): + """Simple 'providerName/model' format resolves to the matching provider.""" + provider = _make_provider("llmgateway") + result = _resolve_provider_and_model("llmgateway/gpt-realtime-mini", [provider]) + assert result is not None + resolved_provider, model_name = result + assert resolved_provider.name == "llmgateway" + assert model_name == "gpt-realtime-mini" + + +def test_resolve_provider_and_model_compound(): + """Model names containing slashes are passed through unchanged after the provider prefix.""" + provider = _make_provider("llmgateway") + result = _resolve_provider_and_model( + "llmgateway/openai/gpt-realtime-mini-2025-12-15", [provider] + ) + assert result is not None + resolved_provider, 
class AudioRealtimeWebModule(ModaiModule, ABC):
    """
    Module declaration for the realtime WebSocket proxy.

    Registers ``/api/realtime`` as a WebSocket route on ``self.router``.
    A client connects there and exchanges OpenAI Realtime API events as
    JSON text frames; the implementation opens a second WebSocket to the
    configured LLM provider and relays frames in both directions, so the
    provider API key stays on the server.
    """

    def __init__(self, dependencies: ModuleDependencies, config: dict[str, Any]):
        super().__init__(dependencies, config)
        router = APIRouter()
        # The bound method is looked up on the instance, so the concrete
        # subclass implementation is what gets registered.
        router.add_api_websocket_route(
            path="/api/realtime",
            endpoint=self.websocket_proxy,
        )
        self.router = router

    @abstractmethod
    async def websocket_proxy(
        self,
        ws: WebSocket,
        model: str = Query(...),
    ) -> None:
        """
        Accept the client WebSocket, connect to the LLM provider, and relay
        all JSON events in both directions until either side closes.

        The required ``model`` query parameter has the form
        ``<provider_name>/<model_name>``
        (e.g. ``myprovider/gpt-realtime-mini-2025-12-15``).
        """
class OpenAIAudioRealtimeModule(AudioRealtimeWebModule):
    """
    OpenAI/compatible implementation of the Realtime WebSocket proxy.

    Resolves the configured LLM provider by name and proxies all events
    between the client WebSocket and the provider's realtime WebSocket.

    No optional config keys at this time.
    """

    # RFC 6455 caps control-frame payloads at 125 bytes; a close frame spends
    # two of them on the status code, leaving 123 bytes for the reason text.
    _MAX_CLOSE_REASON_BYTES = 123

    def __init__(self, dependencies: ModuleDependencies, config: dict[str, Any]):
        """Resolve the required module dependencies.

        Raises:
            ValueError: if ``llm_provider_module`` or ``session`` is missing.
        """
        super().__init__(dependencies, config)

        self.provider_module: ModelProviderModule = dependencies.get_module(
            "llm_provider_module"
        )
        if not self.provider_module:
            raise ValueError(
                "OpenAIAudioRealtimeModule requires 'llm_provider_module' module dependency"
            )

        self.session_module: SessionModule = dependencies.get_module("session")
        if not self.session_module:
            raise ValueError(
                "OpenAIAudioRealtimeModule requires 'session' module dependency"
            )

    @classmethod
    def _close_reason(cls, text: str) -> str:
        """Trim ``text`` so it fits into a WebSocket close frame.

        Reasons built from client input (the ``model`` query parameter) can
        be arbitrarily long; an oversized reason would make the close frame
        itself invalid. Truncation happens on the UTF-8 encoding and drops a
        partially cut trailing character.
        """
        encoded = text.encode("utf-8")
        if len(encoded) <= cls._MAX_CLOSE_REASON_BYTES:
            return text
        return encoded[: cls._MAX_CLOSE_REASON_BYTES].decode("utf-8", errors="ignore")

    async def websocket_proxy(
        self,
        ws: WebSocket,
        model: str = Query(...),
    ) -> None:
        """Validate the session, resolve the provider and proxy the stream.

        Closes the client socket with an application close code when the
        session is invalid (4001), no usable provider matches ``model``
        (4004), or the provider list / upstream connection fails (4500).
        A proxy run that ends without error closes normally.
        """
        # NOTE(review): presumably accepted before validation so that the
        # application-defined close codes below can reach the client.
        await ws.accept()

        try:
            self.session_module.validate_session(ws)
        except Exception:
            # Keep the rejection quiet for the client but leave a trace for
            # operators; the original swallowed the cause entirely.
            logger.debug("Realtime: session validation failed", exc_info=True)
            await ws.close(code=_WS_UNAUTHORIZED, reason="Unauthorized")
            return

        try:
            providers_response = await self.provider_module.get_providers(
                request=ws,
                limit=None,
                offset=None,
            )
        except Exception:
            logger.exception("Failed to retrieve providers")
            await ws.close(
                code=_WS_INTERNAL_ERROR, reason="Failed to retrieve providers"
            )
            return

        resolution = _resolve_provider_and_model(model, providers_response.providers)
        if not resolution:
            logger.warning("Realtime: no provider matched for model '%s'", model)
            await ws.close(
                code=_WS_NOT_FOUND,
                reason=self._close_reason(f"No provider found for model '{model}'"),
            )
            return
        provider, model_name = resolution

        if not provider.base_url:
            logger.warning(
                "Realtime: provider '%s' has no base_url configured", provider.name
            )
            await ws.close(
                code=_WS_NOT_FOUND,
                reason=self._close_reason(
                    f"Provider '{provider.name}' has no base_url configured"
                ),
            )
            return

        upstream_url = _build_ws_url(provider.base_url, model_name)
        # The API key is attached here, server-side only.
        headers = {"Authorization": f"Bearer {provider.api_key}"}
        logger.info("Realtime: connecting to upstream %s", upstream_url)

        # 1000 / no reason is what a bare ``ws.close()`` sends; failures
        # below switch to the internal-error code so the client can tell a
        # broken upstream from a finished conversation.
        close_code = 1000
        close_reason: str | None = None
        try:
            async with websockets.connect(
                upstream_url, additional_headers=headers
            ) as upstream:
                logger.info("Realtime: upstream connected")
                await _proxy(ws, upstream)
                logger.info("Realtime: proxy finished")
        except websockets.exceptions.InvalidStatus as exc:
            logger.warning(
                "Realtime: upstream rejected with HTTP %s", exc.response.status_code
            )
            close_code = _WS_INTERNAL_ERROR
            close_reason = "Upstream provider rejected the connection"
        except Exception:
            logger.exception("Realtime WebSocket proxy error")
            close_code = _WS_INTERNAL_ERROR
            close_reason = "Realtime proxy error"
        finally:
            # Skip the close when the client already went away.
            if ws.client_state != WebSocketState.DISCONNECTED:
                await ws.close(code=close_code, reason=close_reason)
+ + tasks = [ + asyncio.create_task(client_to_upstream()), + asyncio.create_task(upstream_to_client()), + ] + _, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + for task in pending: + task.cancel() + await asyncio.gather(*pending, return_exceptions=True) + + +def _resolve_provider_and_model( + model: str, providers: list +) -> tuple[object, str] | None: + """ + Resolve a (provider, raw_model_name) pair from a model path string. + + The format is ``/`` where ```` may + itself contain slashes (e.g. ``llmgateway/openai/gpt-4o-realtime``). + Everything before the first ``/`` is matched against configured provider + names; everything after becomes the raw model name forwarded upstream. + """ + if not model or "/" not in model: + return None + provider_name, raw_model = model.split("/", 1) + matched = next((p for p in providers if p.name == provider_name), None) + return (matched, raw_model) if matched else None + return None + + +def _build_ws_url(base_url: str, model_name: str) -> str: + """Build the upstream WebSocket URL for the realtime endpoint. + + Converts ``https://`` → ``wss://`` (or ``http://`` → ``ws://``) and + appends ``/realtime?model=``. + The ``base_url`` must already include the full path prefix (e.g. ``https://api.openai.com/v1``). 
+ """ + url = base_url.rstrip("/") + url = url.replace("https://", "wss://").replace("http://", "ws://") + return f"{url}/realtime?model={model_name}" diff --git a/backend/omni/src/modai/modules/tools/tool_registry_predefined_vars.py b/backend/omni/src/modai/modules/tools/tool_registry_predefined_vars.py index 1a87ccf..b59e80c 100644 --- a/backend/omni/src/modai/modules/tools/tool_registry_predefined_vars.py +++ b/backend/omni/src/modai/modules/tools/tool_registry_predefined_vars.py @@ -93,7 +93,7 @@ def __init__(self, dependencies: ModuleDependencies, config: dict[str, Any]): super().__init__(dependencies, config) self._inner_registry: ToolRegistryModule = dependencies.get_module( "delegate_registry" - ) # type: ignore[assignment] + ) self._variable_mappings: list[dict[str, str]] = config.get( "variable_mappings", [] ) diff --git a/backend/omni/src/modai/modules/user_settings/README.md b/backend/omni/src/modai/modules/user_settings/README.md index 1cf2d83..07de486 100644 --- a/backend/omni/src/modai/modules/user_settings/README.md +++ b/backend/omni/src/modai/modules/user_settings/README.md @@ -187,7 +187,7 @@ user_settings: ## Usage Examples -### Frontend Integration +### Client Integration ```typescript // Get all user settings const allSettings = await fetch('/api/user/123/settings', { diff --git a/backend/omni/uv.lock b/backend/omni/uv.lock index 3bde2fc..d6380b5 100644 --- a/backend/omni/uv.lock +++ b/backend/omni/uv.lock @@ -927,6 +927,7 @@ dependencies = [ { name = "strands-agents" }, { name = "strands-agents-tools" }, { name = "uvicorn" }, + { name = "websockets" }, ] [package.dev-dependencies] @@ -942,10 +943,10 @@ dev = [ [package.metadata] requires-dist = [ - { name = "authlib", extras = ["httpx"], specifier = ">=1.6.9" }, + { name = "authlib", extras = ["httpx"] }, { name = "fastapi" }, - { name = "httpx", specifier = ">=0.28.1" }, - { name = "itsdangerous", specifier = ">=2.2.0" }, + { name = "httpx" }, + { name = "itsdangerous" }, { name = "openai" 
}, { name = "pydantic" }, { name = "pyjwt" }, @@ -955,11 +956,12 @@ requires-dist = [ { name = "strands-agents" }, { name = "strands-agents-tools" }, { name = "uvicorn" }, + { name = "websockets" }, ] [package.metadata.requires-dev] dev = [ - { name = "cryptography", specifier = ">=46.0.7" }, + { name = "cryptography" }, { name = "datamodel-code-generator", extras = ["ruff"] }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -2034,6 +2036,42 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, ] +[[package]] +name = "websockets" +version = "16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/04/24/4b2031d72e840ce4c1ccb255f693b15c334757fc50023e4db9537080b8c4/websockets-16.0.tar.gz", hash = "sha256:5f6261a5e56e8d5c42a4497b364ea24d94d9563e8fbd44e78ac40879c60179b5", size = 179346, upload-time = "2026-01-10T09:23:47.181Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cc/9c/baa8456050d1c1b08dd0ec7346026668cbc6f145ab4e314d707bb845bf0d/websockets-16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:878b336ac47938b474c8f982ac2f7266a540adc3fa4ad74ae96fea9823a02cc9", size = 177364, upload-time = "2026-01-10T09:22:59.333Z" }, + { url = "https://files.pythonhosted.org/packages/7e/0c/8811fc53e9bcff68fe7de2bcbe75116a8d959ac699a3200f4847a8925210/websockets-16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:52a0fec0e6c8d9a784c2c78276a48a2bdf099e4ccc2a4cad53b27718dbfd0230", size = 175039, upload-time = "2026-01-10T09:23:01.171Z" }, + { url = "https://files.pythonhosted.org/packages/aa/82/39a5f910cb99ec0b59e482971238c845af9220d3ab9fa76dd9162cda9d62/websockets-16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:e6578ed5b6981005df1860a56e3617f14a6c307e6a71b4fff8c48fdc50f3ed2c", size = 175323, upload-time = "2026-01-10T09:23:02.341Z" }, + { url = "https://files.pythonhosted.org/packages/bd/28/0a25ee5342eb5d5f297d992a77e56892ecb65e7854c7898fb7d35e9b33bd/websockets-16.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:95724e638f0f9c350bb1c2b0a7ad0e83d9cc0c9259f3ea94e40d7b02a2179ae5", size = 184975, upload-time = "2026-01-10T09:23:03.756Z" }, + { url = "https://files.pythonhosted.org/packages/f9/66/27ea52741752f5107c2e41fda05e8395a682a1e11c4e592a809a90c6a506/websockets-16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0204dc62a89dc9d50d682412c10b3542d748260d743500a85c13cd1ee4bde82", size = 186203, upload-time = "2026-01-10T09:23:05.01Z" }, + { url = "https://files.pythonhosted.org/packages/37/e5/8e32857371406a757816a2b471939d51c463509be73fa538216ea52b792a/websockets-16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:52ac480f44d32970d66763115edea932f1c5b1312de36df06d6b219f6741eed8", size = 185653, upload-time = "2026-01-10T09:23:06.301Z" }, + { url = "https://files.pythonhosted.org/packages/9b/67/f926bac29882894669368dc73f4da900fcdf47955d0a0185d60103df5737/websockets-16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6e5a82b677f8f6f59e8dfc34ec06ca6b5b48bc4fcda346acd093694cc2c24d8f", size = 184920, upload-time = "2026-01-10T09:23:07.492Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a1/3d6ccdcd125b0a42a311bcd15a7f705d688f73b2a22d8cf1c0875d35d34a/websockets-16.0-cp313-cp313-win32.whl", hash = "sha256:abf050a199613f64c886ea10f38b47770a65154dc37181bfaff70c160f45315a", size = 178255, upload-time = "2026-01-10T09:23:09.245Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ae/90366304d7c2ce80f9b826096a9e9048b4bb760e44d3b873bb272cba696b/websockets-16.0-cp313-cp313-win_amd64.whl", hash = 
"sha256:3425ac5cf448801335d6fdc7ae1eb22072055417a96cc6b31b3861f455fbc156", size = 178689, upload-time = "2026-01-10T09:23:10.483Z" }, + { url = "https://files.pythonhosted.org/packages/f3/1d/e88022630271f5bd349ed82417136281931e558d628dd52c4d8621b4a0b2/websockets-16.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8cc451a50f2aee53042ac52d2d053d08bf89bcb31ae799cb4487587661c038a0", size = 177406, upload-time = "2026-01-10T09:23:12.178Z" }, + { url = "https://files.pythonhosted.org/packages/f2/78/e63be1bf0724eeb4616efb1ae1c9044f7c3953b7957799abb5915bffd38e/websockets-16.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:daa3b6ff70a9241cf6c7fc9e949d41232d9d7d26fd3522b1ad2b4d62487e9904", size = 175085, upload-time = "2026-01-10T09:23:13.511Z" }, + { url = "https://files.pythonhosted.org/packages/bb/f4/d3c9220d818ee955ae390cf319a7c7a467beceb24f05ee7aaaa2414345ba/websockets-16.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:fd3cb4adb94a2a6e2b7c0d8d05cb94e6f1c81a0cf9dc2694fb65c7e8d94c42e4", size = 175328, upload-time = "2026-01-10T09:23:14.727Z" }, + { url = "https://files.pythonhosted.org/packages/63/bc/d3e208028de777087e6fb2b122051a6ff7bbcca0d6df9d9c2bf1dd869ae9/websockets-16.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:781caf5e8eee67f663126490c2f96f40906594cb86b408a703630f95550a8c3e", size = 185044, upload-time = "2026-01-10T09:23:15.939Z" }, + { url = "https://files.pythonhosted.org/packages/ad/6e/9a0927ac24bd33a0a9af834d89e0abc7cfd8e13bed17a86407a66773cc0e/websockets-16.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:caab51a72c51973ca21fa8a18bd8165e1a0183f1ac7066a182ff27107b71e1a4", size = 186279, upload-time = "2026-01-10T09:23:17.148Z" }, + { url = "https://files.pythonhosted.org/packages/b9/ca/bf1c68440d7a868180e11be653c85959502efd3a709323230314fda6e0b3/websockets-16.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = 
"sha256:19c4dc84098e523fd63711e563077d39e90ec6702aff4b5d9e344a60cb3c0cb1", size = 185711, upload-time = "2026-01-10T09:23:18.372Z" }, + { url = "https://files.pythonhosted.org/packages/c4/f8/fdc34643a989561f217bb477cbc47a3a07212cbda91c0e4389c43c296ebf/websockets-16.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a5e18a238a2b2249c9a9235466b90e96ae4795672598a58772dd806edc7ac6d3", size = 184982, upload-time = "2026-01-10T09:23:19.652Z" }, + { url = "https://files.pythonhosted.org/packages/dd/d1/574fa27e233764dbac9c52730d63fcf2823b16f0856b3329fc6268d6ae4f/websockets-16.0-cp314-cp314-win32.whl", hash = "sha256:a069d734c4a043182729edd3e9f247c3b2a4035415a9172fd0f1b71658a320a8", size = 177915, upload-time = "2026-01-10T09:23:21.458Z" }, + { url = "https://files.pythonhosted.org/packages/8a/f1/ae6b937bf3126b5134ce1f482365fde31a357c784ac51852978768b5eff4/websockets-16.0-cp314-cp314-win_amd64.whl", hash = "sha256:c0ee0e63f23914732c6d7e0cce24915c48f3f1512ec1d079ed01fc629dab269d", size = 178381, upload-time = "2026-01-10T09:23:22.715Z" }, + { url = "https://files.pythonhosted.org/packages/06/9b/f791d1db48403e1f0a27577a6beb37afae94254a8c6f08be4a23e4930bc0/websockets-16.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:a35539cacc3febb22b8f4d4a99cc79b104226a756aa7400adc722e83b0d03244", size = 177737, upload-time = "2026-01-10T09:23:24.523Z" }, + { url = "https://files.pythonhosted.org/packages/bd/40/53ad02341fa33b3ce489023f635367a4ac98b73570102ad2cdd770dacc9a/websockets-16.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b784ca5de850f4ce93ec85d3269d24d4c82f22b7212023c974c401d4980ebc5e", size = 175268, upload-time = "2026-01-10T09:23:25.781Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/6158d4e459b984f949dcbbb0c5d270154c7618e11c01029b9bbd1bb4c4f9/websockets-16.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:569d01a4e7fba956c5ae4fc988f0d4e187900f5497ce46339c996dbf24f17641", size = 175486, upload-time = "2026-01-10T09:23:27.033Z" }, + { 
url = "https://files.pythonhosted.org/packages/e5/2d/7583b30208b639c8090206f95073646c2c9ffd66f44df967981a64f849ad/websockets-16.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:50f23cdd8343b984957e4077839841146f67a3d31ab0d00e6b824e74c5b2f6e8", size = 185331, upload-time = "2026-01-10T09:23:28.259Z" }, + { url = "https://files.pythonhosted.org/packages/45/b0/cce3784eb519b7b5ad680d14b9673a31ab8dcb7aad8b64d81709d2430aa8/websockets-16.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:152284a83a00c59b759697b7f9e9cddf4e3c7861dd0d964b472b70f78f89e80e", size = 186501, upload-time = "2026-01-10T09:23:29.449Z" }, + { url = "https://files.pythonhosted.org/packages/19/60/b8ebe4c7e89fb5f6cdf080623c9d92789a53636950f7abacfc33fe2b3135/websockets-16.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bc59589ab64b0022385f429b94697348a6a234e8ce22544e3681b2e9331b5944", size = 186062, upload-time = "2026-01-10T09:23:31.368Z" }, + { url = "https://files.pythonhosted.org/packages/88/a8/a080593f89b0138b6cba1b28f8df5673b5506f72879322288b031337c0b8/websockets-16.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:32da954ffa2814258030e5a57bc73a3635463238e797c7375dc8091327434206", size = 185356, upload-time = "2026-01-10T09:23:32.627Z" }, + { url = "https://files.pythonhosted.org/packages/c2/b6/b9afed2afadddaf5ebb2afa801abf4b0868f42f8539bfe4b071b5266c9fe/websockets-16.0-cp314-cp314t-win32.whl", hash = "sha256:5a4b4cc550cb665dd8a47f868c8d04c8230f857363ad3c9caf7a0c3bf8c61ca6", size = 178085, upload-time = "2026-01-10T09:23:33.816Z" }, + { url = "https://files.pythonhosted.org/packages/9f/3e/28135a24e384493fa804216b79a6a6759a38cc4ff59118787b9fb693df93/websockets-16.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b14dc141ed6d2dde437cddb216004bcac6a1df0935d79656387bd41632ba0bbd", size = 178531, upload-time = "2026-01-10T09:23:35.016Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" }, +] + [[package]] name = "werkzeug" version = "3.1.6" diff --git a/frontend/omni/CHANGELOG.md b/frontend/omni/CHANGELOG.md index 5cfcf10..de8183c 100644 --- a/frontend/omni/CHANGELOG.md +++ b/frontend/omni/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Voice button in the chat input: speak directly to the AI and hear its response out loud. Conversations appear in the chat window. +- **Audio settings page** (Global Settings → Audio) + ### Changed - The sidebar is now resizable. diff --git a/frontend/omni/public/modules_browser_only.json b/frontend/omni/public/modules_browser_only.json index 4642b28..6b29390 100644 --- a/frontend/omni/public/modules_browser_only.json +++ b/frontend/omni/public/modules_browser_only.json @@ -18,6 +18,7 @@ "chat-route", "providers-route", "localization-route", + "audio-route", "chat-fallback-route", "sidebar-layout-route" ] @@ -72,6 +73,12 @@ "path": "@/modules/markdown-renderer/markedRenderer/create", "dependencies": {} }, + { + "id": "audio-service", + "type": "AudioService", + "path": "@/modules/audio-service/webAudioService/create", + "dependencies": {} + }, { "id": "chat", "type": "ChatComponent", @@ -80,6 +87,7 @@ "module:chatService": "chat-service", "module:llmProviderService": "llm-provider-service", "module:toolsService": "tools-service", + "module:audioService": "audio-service", "module:markdownRenderers": ["katex-renderer", "marked-renderer"] } }, @@ -132,7 +140,10 @@ "type": "SidebarTopItem", "path": "@/modules/global-settings/globalSettingsNavigationItem", "dependencies": { - "module:globalSettingsSidebarItems": ["providers-navigation-item"] + 
"module:globalSettingsSidebarItems": [ + "providers-navigation-item", + "audio-navigation-item" + ] } }, { @@ -168,6 +179,20 @@ "type": "Route", "path": "@/modules/user-settings/localizationRouteDefinition/create", "dependencies": {} + }, + { + "id": "audio-navigation-item", + "type": "GlobalSettingsSidebarItem", + "path": "@/modules/audio-settings/audioNavigationItem", + "dependencies": {} + }, + { + "id": "audio-route", + "type": "Route", + "path": "@/modules/audio-settings/audioRouteDefinition/create", + "dependencies": { + "module:llmProviderService": "llm-provider-service" + } } ] } diff --git a/frontend/omni/public/modules_with_backend.json b/frontend/omni/public/modules_with_backend.json index 99e8c38..bf09876 100644 --- a/frontend/omni/public/modules_with_backend.json +++ b/frontend/omni/public/modules_with_backend.json @@ -28,6 +28,7 @@ "chat-route", "providers-route", "localization-route", + "audio-route", "chat-fallback-route", "sidebar-layout-route" ] @@ -108,6 +109,12 @@ "path": "@/modules/markdown-renderer/markedRenderer/create", "dependencies": {} }, + { + "id": "audio-service", + "type": "AudioService", + "path": "@/modules/audio-service/webAudioService/create", + "dependencies": {} + }, { "id": "chat", "type": "ChatComponent", @@ -116,6 +123,7 @@ "module:chatService": "chat-service", "module:llmProviderService": "llm-provider-service", "module:toolsService": "tools-service", + "module:audioService": "audio-service", "module:markdownRenderers": ["katex-renderer", "marked-renderer"] } }, @@ -170,7 +178,10 @@ "type": "SidebarTopItem", "path": "@/modules/global-settings/globalSettingsNavigationItem", "dependencies": { - "module:globalSettingsSidebarItems": ["providers-navigation-item"] + "module:globalSettingsSidebarItems": [ + "providers-navigation-item", + "audio-navigation-item" + ] } }, { @@ -207,6 +218,20 @@ "path": "@/modules/user-settings/localizationRouteDefinition/create", "dependencies": {} }, + { + "id": "audio-navigation-item", + "type": 
"GlobalSettingsSidebarItem", + "path": "@/modules/audio-settings/audioNavigationItem", + "dependencies": {} + }, + { + "id": "audio-route", + "type": "Route", + "path": "@/modules/audio-settings/audioRouteDefinition/create", + "dependencies": { + "module:llmProviderService": "llm-provider-service" + } + }, { "id": "logout-item", "path": "@/modules/authentication/LogoutItem", diff --git a/frontend/omni/src/modules/audio-service/index.svelte.ts b/frontend/omni/src/modules/audio-service/index.svelte.ts new file mode 100644 index 0000000..0323796 --- /dev/null +++ b/frontend/omni/src/modules/audio-service/index.svelte.ts @@ -0,0 +1,23 @@ +import type { ProviderModel } from "@/modules/llm-provider-service/index.svelte.js"; + +export type RealtimeStatus = "idle" | "connecting" | "active" | "error"; + +export interface RealtimeMessageCallback { + onDelta(role: "user" | "assistant", delta: string): void; + onDone(role: "user" | "assistant"): void; +} + +export interface RealtimeSession { + readonly status: RealtimeStatus; + start(): Promise; + stop(): void; + cleanup(): void; +} + +export interface AudioService { + createSession( + model: () => ProviderModel | null, + transcriptModel: () => ProviderModel | null, + onMessage: RealtimeMessageCallback, + ): RealtimeSession; +} diff --git a/frontend/omni/src/modules/audio-service/realtimeAudio.ts b/frontend/omni/src/modules/audio-service/realtimeAudio.ts new file mode 100644 index 0000000..17f278c --- /dev/null +++ b/frontend/omni/src/modules/audio-service/realtimeAudio.ts @@ -0,0 +1,32 @@ +/** + * Schedules a base64-encoded PCM16 audio chunk for playback via the Web Audio API. + * Chunks are scheduled contiguously to avoid gaps between them. 
+ */ +export function playAudioDelta( + audioContext: AudioContext, + base64: string, + playbackTimeRef: { value: number }, +): void { + if (audioContext.state === "suspended") { + void audioContext.resume(); + } + // Decode base64 → PCM16 → Float32 + const binary = atob(base64); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i); + const int16 = new Int16Array(bytes.buffer); + const float32 = new Float32Array(int16.length); + for (let i = 0; i < int16.length; i++) float32[i] = int16[i] / 32768; + + const buffer = audioContext.createBuffer(1, float32.length, 24000); + buffer.copyToChannel(float32, 0); + + const source = audioContext.createBufferSource(); + source.buffer = buffer; + source.connect(audioContext.destination); + + // Schedule contiguously so chunks play without gaps + const startAt = Math.max(playbackTimeRef.value, audioContext.currentTime); + source.start(startAt); + playbackTimeRef.value = startAt + buffer.duration; +} diff --git a/frontend/omni/src/modules/audio-service/realtimeConfig.ts b/frontend/omni/src/modules/audio-service/realtimeConfig.ts new file mode 100644 index 0000000..eefe832 --- /dev/null +++ b/frontend/omni/src/modules/audio-service/realtimeConfig.ts @@ -0,0 +1,43 @@ +/** AudioWorklet processor source code (loaded via Blob URL). 
*/ +export const WORKLET_CODE = ` +class PCM16Processor extends AudioWorkletProcessor { + process(inputs) { + const channel = inputs[0]?.[0]; + if (!channel) return true; + const int16 = new Int16Array(channel.length); + for (let i = 0; i < channel.length; i++) { + int16[i] = Math.max(-32768, Math.min(32767, channel[i] * 32767 | 0)); + } + this.port.postMessage(int16.buffer, [int16.buffer]); + return true; + } +} +registerProcessor('pcm16-processor', PCM16Processor); +`; + +// Important to read: https://developers.openai.com/api/docs/guides/realtime-conversations#handling-audio-with-websockets + +/** Builds the session.update payload with the configured transcript model. */ +export function buildSessionUpdatePayload(transcriptModelName: string): string { + return JSON.stringify({ + type: "session.update", + session: { + type: "realtime", + output_modalities: ["audio"], + instructions: "Respond very very very briefly.", + audio: { + input: { + format: { type: "audio/pcm", rate: 24000 }, + ...(transcriptModelName + ? 
{ transcription: { model: transcriptModelName } } + : {}), + turn_detection: { type: "semantic_vad" }, + }, + output: { + format: { type: "audio/pcm", rate: 24000 }, + voice: "alloy", + }, + }, + }, + }); +} diff --git a/frontend/omni/src/modules/audio-service/realtimeEventHandler.ts b/frontend/omni/src/modules/audio-service/realtimeEventHandler.ts new file mode 100644 index 0000000..17a6bbd --- /dev/null +++ b/frontend/omni/src/modules/audio-service/realtimeEventHandler.ts @@ -0,0 +1,64 @@ +import type { RealtimeMessageCallback } from "./index.svelte.js"; +import { playAudioDelta } from "./realtimeAudio.js"; + +export function handleRealtimeEvent( + event: MessageEvent, + audioContext: AudioContext | null, + playbackTimeRef: { value: number }, + onMessage: RealtimeMessageCallback, + userDeltaRef: { received: boolean }, +): void { + try { + const msg = JSON.parse(event.data as string) as Record; + + if ( + (msg.type === "response.audio.delta" || + msg.type === "response.output_audio.delta") && + typeof msg.delta === "string" && + audioContext + ) { + playAudioDelta(audioContext, msg.delta, playbackTimeRef); + } else if ( + (msg.type === "response.audio_transcript.delta" || + msg.type === "response.output_audio_transcript.delta") && + typeof msg.delta === "string" && + msg.delta + ) { + onMessage.onDelta("assistant", msg.delta); + } else if ( + msg.type === "response.audio_transcript.done" || + msg.type === "response.output_audio_transcript.done" + ) { + onMessage.onDone("assistant"); + } else if ( + msg.type === "conversation.item.input_audio_transcription.delta" && + typeof msg.delta === "string" && + msg.delta + ) { + userDeltaRef.received = true; + onMessage.onDelta("user", msg.delta); + } else if ( + msg.type === "conversation.item.input_audio_transcription.completed" + ) { + // Fallback: if no deltas arrived, use the full transcript from completed + if ( + !userDeltaRef.received && + typeof msg.transcript === "string" && + msg.transcript + ) { + 
onMessage.onDelta("user", msg.transcript); + } + userDeltaRef.received = false; + onMessage.onDone("user"); + } else if (msg.type === "session.created") { + // No need to do anything, we optimistically updated the UI on start() + } else if (msg.type === "error") { + console.error( + "[Realtime] provider error:", + JSON.stringify(msg.error), + ); + } + } catch (e) { + console.error("[Realtime] Failed to parse event:", e); + } +} diff --git a/frontend/omni/src/modules/audio-service/realtimeMicSetup.ts b/frontend/omni/src/modules/audio-service/realtimeMicSetup.ts new file mode 100644 index 0000000..3e72f94 --- /dev/null +++ b/frontend/omni/src/modules/audio-service/realtimeMicSetup.ts @@ -0,0 +1,37 @@ +import { WORKLET_CODE } from "./realtimeConfig.js"; + +export async function loadAudioWorklet(ctx: AudioContext): Promise { + const blob = new Blob([WORKLET_CODE], { type: "application/javascript" }); + const blobUrl = URL.createObjectURL(blob); + await ctx.audioWorklet.addModule(blobUrl); + URL.revokeObjectURL(blobUrl); +} + +export function connectMicToWorklet( + ctx: AudioContext, + micStream: MediaStream, + sendAudio: (payload: string) => void, +): void { + const micSource = ctx.createMediaStreamSource(micStream); + const workletNode = new AudioWorkletNode(ctx, "pcm16-processor"); + + workletNode.port.onmessage = (e: MessageEvent) => { + const bytes = new Uint8Array(e.data); + let binary = ""; + for (let i = 0; i < bytes.byteLength; i++) { + binary += String.fromCharCode(bytes[i]); + } + sendAudio( + JSON.stringify({ + type: "input_audio_buffer.append", + audio: btoa(binary), + }), + ); + }; + + const silentOut = ctx.createGain(); + silentOut.gain.value = 0; + silentOut.connect(ctx.destination); + micSource.connect(workletNode); + workletNode.connect(silentOut); +} diff --git a/frontend/omni/src/modules/audio-service/realtimeSession.svelte.ts b/frontend/omni/src/modules/audio-service/realtimeSession.svelte.ts new file mode 100644 index 0000000..af9822e --- /dev/null 
+++ b/frontend/omni/src/modules/audio-service/realtimeSession.svelte.ts @@ -0,0 +1,120 @@ +import type { ProviderModel } from "@/modules/llm-provider-service/index.svelte.js"; +import type { + RealtimeMessageCallback, + RealtimeSession, + RealtimeStatus, +} from "./index.svelte.js"; +import { buildSessionUpdatePayload } from "./realtimeConfig.js"; +import { handleRealtimeEvent } from "./realtimeEventHandler.js"; +import { connectMicToWorklet, loadAudioWorklet } from "./realtimeMicSetup.js"; +import { buildRealtimeWsParams } from "./realtimeWsParams.js"; + +export function createRealtimeSession( + model: () => ProviderModel | null, + transcriptModel: () => ProviderModel | null, + onMessage: RealtimeMessageCallback, +): RealtimeSession { + let status = $state("idle"); + let ws: WebSocket | null = null; + let audioContext: AudioContext | null = null; + let micStream: MediaStream | null = null; + const playbackTimeRef = { value: 0 }; + const userDeltaRef = { received: false }; + + async function start() { + status = "connecting"; + try { + micStream = await navigator.mediaDevices.getUserMedia({ + audio: true, + }); + + const ctx = new AudioContext({ sampleRate: 24000 }); + await ctx.resume(); + audioContext = ctx; + playbackTimeRef.value = ctx.currentTime; + + await loadAudioWorklet(ctx); + + const m = model(); + const { url: wsUrl, protocols } = buildRealtimeWsParams(m); + console.log("[Realtime] connecting to", wsUrl); + ws = + protocols.length > 0 + ? new WebSocket(wsUrl, protocols) + : new WebSocket(wsUrl); + + const localWs = ws; + await new Promise((resolve, reject) => { + localWs.onopen = () => { + const payload = buildSessionUpdatePayload( + transcriptModel()?.modelName ?? 
"", + ); + console.log( + "[Realtime] WebSocket connected, sending session.update:", + payload, + ); + ws?.send(payload); + resolve(); + }; + localWs.onerror = () => + reject(new Error("WebSocket connection failed")); + }); + + ws.onmessage = (e) => + handleRealtimeEvent( + e, + audioContext, + playbackTimeRef, + onMessage, + userDeltaRef, + ); + ws.onerror = () => { + cleanup(); + status = "error"; + }; + ws.onclose = (e) => { + if (status === "active" || status === "connecting") { + cleanup(); + status = e.wasClean ? "idle" : "error"; + } + }; + + connectMicToWorklet(ctx, micStream, (payload) => { + if (ws?.readyState === WebSocket.OPEN) ws.send(payload); + }); + + status = "active"; + } catch (err) { + console.error("[Realtime] Failed to start:", err); + cleanup(); + status = "error"; + } + } + + function stop() { + cleanup(); + status = "idle"; + } + + function cleanup() { + ws?.close(); + ws = null; + micStream?.getTracks().forEach((t) => { + t.stop(); + }); + micStream = null; + audioContext?.close(); + audioContext = null; + playbackTimeRef.value = 0; + userDeltaRef.received = false; + } + + return { + get status() { + return status; + }, + start, + stop, + cleanup, + }; +} diff --git a/frontend/omni/src/modules/audio-service/realtimeWsParams.ts b/frontend/omni/src/modules/audio-service/realtimeWsParams.ts new file mode 100644 index 0000000..3e38b69 --- /dev/null +++ b/frontend/omni/src/modules/audio-service/realtimeWsParams.ts @@ -0,0 +1,41 @@ +import type { ProviderModel } from "@/modules/llm-provider-service/index.svelte.js"; + +/** + * Build the WebSocket URL and authentication protocols for the realtime endpoint. + * + * - If providerBaseUrl is relative (starts with "/") the request is routed + * through the modAI backend proxy which adds auth headers server-side. + * - If providerBaseUrl is an absolute URL the browser connects directly to + * the provider. 
Since browser WebSockets cannot set custom headers, the + * API key is passed via the WebSocket subprotocol mechanism that OpenAI + * supports for browser clients. + */ +export function buildRealtimeWsParams(m: ProviderModel | null): { + url: string; + protocols: string[]; +} { + const proto = window.location.protocol === "https:" ? "wss:" : "ws:"; + + if (!m?.providerBaseUrl.startsWith("http")) { + // Backend proxy mode — backend resolves the provider and adds auth + const modelParam = m + ? encodeURIComponent(`${m.providerName}/${m.modelName}`) + : ""; + return { + url: `${proto}//${window.location.host}/api/realtime?model=${modelParam}`, + protocols: [], + }; + } + + // Direct browser mode — pass auth via WebSocket subprotocol (the only + // mechanism available, as browser WebSocket cannot set request headers). + const wsBase = m.providerBaseUrl + .replace("https://", "wss://") + .replace("http://", "ws://") + .replace(/\/$/, ""); + const url = `${wsBase}/realtime?model=${encodeURIComponent(m.modelName)}`; + const protocols = m.providerApiKey + ? 
["realtime", `openai-insecure-api-key.${m.providerApiKey}`] + : []; + return { url, protocols }; +} diff --git a/frontend/omni/src/modules/audio-service/webAudioService.svelte.ts b/frontend/omni/src/modules/audio-service/webAudioService.svelte.ts new file mode 100644 index 0000000..6f9ff0e --- /dev/null +++ b/frontend/omni/src/modules/audio-service/webAudioService.svelte.ts @@ -0,0 +1,20 @@ +import type { ModuleDependencies } from "@/core/module-system/index.js"; +import type { ProviderModel } from "@/modules/llm-provider-service/index.svelte.js"; +import type { + AudioService, + RealtimeMessageCallback, + RealtimeSession, +} from "./index.svelte.js"; +import { createRealtimeSession } from "./realtimeSession.svelte.js"; + +export function create(_deps: ModuleDependencies): AudioService { + return { + createSession( + model: () => ProviderModel | null, + transcriptModel: () => ProviderModel | null, + onMessage: RealtimeMessageCallback, + ): RealtimeSession { + return createRealtimeSession(model, transcriptModel, onMessage); + }, + }; +} diff --git a/frontend/omni/src/modules/audio-settings/AudioRoute.svelte b/frontend/omni/src/modules/audio-settings/AudioRoute.svelte new file mode 100644 index 0000000..986019b --- /dev/null +++ b/frontend/omni/src/modules/audio-settings/AudioRoute.svelte @@ -0,0 +1,248 @@ + + +
+
+

+ + {t("title", { defaultValue: "Audio" })} +

+

+ {t("subtitle", { defaultValue: "Configure the model used for real-time voice chat." })} +

+
+ +
+ +
+ + {t("modelLabel", { defaultValue: "Realtime voice model" })} + + + {#if loading} +

+ {t("loadingModels", { defaultValue: "Loading models..." })} +

+ {:else if allModels.length === 0} +

+ {t("noProviders", { defaultValue: "No providers configured. Add a provider in Global Settings → Providers first." })} +

+ {:else} + + + {#snippet child({ props })} + + {/snippet} + + + + + + {t("noModelsFound", { defaultValue: "No models found." })} + {#each voiceProviderGroups as group} + + {#each group.models as m} + handleVoiceSelect(m)} + > + {m.modelName} + {#if isSameModel(selectedVoiceModel, m)} + + {/if} + + {/each} + + {/each} + + + + + {/if} +
+ + +
+ + {t("transcriptModelLabel", { defaultValue: "Transcript model" })} + + + {#if loading} +

+ {t("loadingModels", { defaultValue: "Loading models..." })} +

+ {:else if allModels.length === 0} +

+ {t("noProviders", { defaultValue: "No providers configured. Add a provider in Global Settings → Providers first." })} +

+ {:else} + + + {#snippet child({ props })} + + {/snippet} + + + + + + {t("noModelsFound", { defaultValue: "No models found." })} + {#each transcriptProviderGroups as group} + + {#each group.models as m} + handleTranscriptSelect(m)} + > + {m.modelName} + {#if isSameModel(selectedTranscriptModel, m)} + + {/if} + + {/each} + + {/each} + + + + + {/if} +
+ + {#if !loading && allModels.length > 0} + {#if noRealtimeModels} +

+ {t("noRealtimeModels", { defaultValue: "No dedicated realtime models found — showing all models. Only models with 'realtime' in their name support voice." })} +

+ {/if} +

+ {t("modelHint", { defaultValue: "Only gpt-4o-realtime-preview or gpt-4o-mini-realtime-preview work for voice." })} +

+ {/if} +
+
+ diff --git a/frontend/omni/src/modules/audio-settings/audioNavigationItem.svelte b/frontend/omni/src/modules/audio-settings/audioNavigationItem.svelte new file mode 100644 index 0000000..82fdfd4 --- /dev/null +++ b/frontend/omni/src/modules/audio-settings/audioNavigationItem.svelte @@ -0,0 +1,18 @@ + + + + router.navigate(AUDIO_PATH)}> + + {t("navLabel", { defaultValue: "Audio" })} + + diff --git a/frontend/omni/src/modules/audio-settings/audioRouteDefinition.svelte.ts b/frontend/omni/src/modules/audio-settings/audioRouteDefinition.svelte.ts new file mode 100644 index 0000000..fe8e52b --- /dev/null +++ b/frontend/omni/src/modules/audio-settings/audioRouteDefinition.svelte.ts @@ -0,0 +1,12 @@ +import type { Routes } from "../router/index.svelte"; +import AudioRoute from "./AudioRoute.svelte"; + +export const AUDIO_PATH = "/settings/audio"; + +export function create(): Routes { + return { + "/settings": { + "/audio": AudioRoute, + }, + }; +} diff --git a/frontend/omni/src/modules/audio-settings/audioSettings.svelte.ts b/frontend/omni/src/modules/audio-settings/audioSettings.svelte.ts new file mode 100644 index 0000000..a02ff98 --- /dev/null +++ b/frontend/omni/src/modules/audio-settings/audioSettings.svelte.ts @@ -0,0 +1,67 @@ +import type { ProviderModel } from "@/modules/llm-provider-service/index.svelte.js"; + +const STORAGE_KEY = "modai-audio-realtime-model"; +const TRANSCRIPT_STORAGE_KEY = "modai-audio-transcript-model"; + +function isProviderModel(value: unknown): value is ProviderModel { + if (typeof value !== "object" || value === null) return false; + const v = value as Record; + return ( + typeof v.providerName === "string" && + typeof v.providerBaseUrl === "string" && + typeof v.modelName === "string" + ); +} + +function loadStoredProviderModel(key: string): ProviderModel | null { + if (typeof localStorage === "undefined") return null; + const stored = localStorage.getItem(key); + if (!stored) return null; + try { + const parsed: unknown = 
JSON.parse(stored); + if (isProviderModel(parsed)) return parsed; + // Stale string format or invalid shape — discard + localStorage.removeItem(key); + return null; + } catch { + localStorage.removeItem(key); + return null; + } +} + +let realtimeModel = $state( + loadStoredProviderModel(STORAGE_KEY), +); +let transcriptModel = $state( + loadStoredProviderModel(TRANSCRIPT_STORAGE_KEY), +); + +/** Returns the currently configured realtime voice model (reactive), or null if none selected. */ +export function getRealtimeModel(): ProviderModel | null { + return realtimeModel; +} + +/** Persist a new realtime voice model selection. */ +export function setRealtimeModel(model: ProviderModel): void { + realtimeModel = model; + localStorage.setItem(STORAGE_KEY, JSON.stringify(model)); +} + +/** Returns the currently configured transcript model (reactive), or null if none selected. */ +export function getTranscriptModel(): ProviderModel | null { + return transcriptModel; +} + +/** Persist a new transcript model selection. */ +export function setTranscriptModel(model: ProviderModel): void { + transcriptModel = model; + localStorage.setItem(TRANSCRIPT_STORAGE_KEY, JSON.stringify(model)); +} + +/** + * Returns true when the given model ID is a realtime-capable model. + * Identified by the presence of "realtime" in the model ID (case-insensitive). 
+ */ +export function isRealtimeModel(modelId: string): boolean { + return modelId.toLowerCase().includes("realtime"); +} diff --git a/frontend/omni/src/modules/audio-settings/locales/de.json b/frontend/omni/src/modules/audio-settings/locales/de.json new file mode 100644 index 0000000..6dac8df --- /dev/null +++ b/frontend/omni/src/modules/audio-settings/locales/de.json @@ -0,0 +1,12 @@ +{ + "navLabel": "Audio", + "title": "Audio", + "subtitle": "Konfigurieren Sie das Modell für Echtzeit-Sprachgespräche.", + "modelLabel": "Realtime-Sprachmodell", + "loadingModels": "Modelle werden geladen...", + "noProviders": "Keine Anbieter konfiguriert. Fügen Sie zuerst einen Anbieter unter Globale Einstellungen → Anbieter hinzu.", + "noRealtimeModels": "Keine dedizierten Realtime-Modelle gefunden – alle Modelle werden angezeigt. Nur Modelle mit 'realtime' im Namen unterstützen Sprache.", + "modelHint": "Nur gpt-4o-realtime-preview oder gpt-4o-mini-realtime-preview funktionieren für Sprachgespräche.", + "searchModels": "Modelle suchen...", + "noModelsFound": "Keine Modelle gefunden." +} diff --git a/frontend/omni/src/modules/chat/ChatComponent.svelte b/frontend/omni/src/modules/chat/ChatComponent.svelte index 744ed1e..27aef6a 100644 --- a/frontend/omni/src/modules/chat/ChatComponent.svelte +++ b/frontend/omni/src/modules/chat/ChatComponent.svelte @@ -1,6 +1,10 @@
0} renderers={markdownRenderers} /> - {#if messages.length === 0 && availableModels.length > 0} + {#if chat.messages.length === 0 && availableModels.length > 0} {/if} {#if availableTools.length > 0} @@ -198,5 +149,17 @@ function makeMessageId(): string { bind:selectedModel {selectedModelData} /> + + {#snippet rightActions()} + {#if resolvedRealtimeModel} + + {/if} + {/snippet}
diff --git a/frontend/omni/src/modules/chat/ChatInputPanel.svelte b/frontend/omni/src/modules/chat/ChatInputPanel.svelte index 09db2cb..4188116 100644 --- a/frontend/omni/src/modules/chat/ChatInputPanel.svelte +++ b/frontend/omni/src/modules/chat/ChatInputPanel.svelte @@ -11,11 +11,13 @@ let { canChat, isIdle, children, + rightActions, onsend, }: { canChat: boolean; isIdle: boolean; children?: Snippet; + rightActions?: Snippet; onsend: (text: string) => void; } = $props(); @@ -60,6 +62,8 @@ function sendMessage() {
+ {@render rightActions?.()} + +{:else if session.status === "active"} + +{:else if session.status === "error"} + +{:else} + +{/if} diff --git a/frontend/omni/src/modules/chat/chatMessages.svelte.ts b/frontend/omni/src/modules/chat/chatMessages.svelte.ts new file mode 100644 index 0000000..ec13992 --- /dev/null +++ b/frontend/omni/src/modules/chat/chatMessages.svelte.ts @@ -0,0 +1,187 @@ +import type { UIMessage } from "ai"; +import type { RealtimeMessageCallback } from "@/modules/audio-service/index.svelte.js"; +import type { ChatService } from "@/modules/chat-service/index.svelte.js"; +import type { ProviderModel } from "@/modules/llm-provider-service/index.svelte.js"; +import type { OpenAIFunctionTool } from "@/modules/tools-service/index.svelte.js"; + +export function createChatMessages(chatService: ChatService) { + let messages = $state[]>([]); + let chatStatus = $state<"ready" | "submitted" | "streaming">("ready"); + const isIdle = $derived( + chatStatus !== "streaming" && chatStatus !== "submitted", + ); + + function makeMessageId(): string { + return `${Date.now()}_${Math.random().toString(36).slice(2, 9)}`; + } + + const streamingIds: Record<"user" | "assistant", string | null> = { + user: null, + assistant: null, + }; + + // User transcripts arrive late (after assistant has already responded). + // When the assistant starts a new turn, pre-create a user placeholder + // before the assistant message so ordering is correct in the UI. 
+ const pendingUserIds: string[] = []; + + const realtimeCallbacks: RealtimeMessageCallback = { + onDelta(role, delta) { + let id = streamingIds[role]; + if (!id) { + if (role === "assistant") { + // Pre-insert user placeholder first, then assistant message + const userId = makeMessageId(); + pendingUserIds.push(userId); + id = makeMessageId(); + streamingIds.assistant = id; + messages = [ + ...messages, + { + id: userId, + role: "user", + parts: [{ type: "text", text: "" }], + }, + { + id, + role: "assistant", + parts: [{ type: "text", text: "" }], + }, + ]; + } else { + // Fill in the pending placeholder if one was pre-created + const pendingId = pendingUserIds.shift(); + if (pendingId) { + id = pendingId; + streamingIds.user = id; + } else { + id = makeMessageId(); + streamingIds.user = id; + messages = [ + ...messages, + { + id, + role: "user", + parts: [{ type: "text", text: "" }], + }, + ]; + } + } + } + messages = messages.map((m) => { + if (m.id !== id) return m; + const prev = m.parts.find((p) => p.type === "text")?.text ?? ""; + return { ...m, parts: [{ type: "text", text: prev + delta }] }; + }); + }, + onDone(role) { + if (role === "user") { + const id = streamingIds.user ?? pendingUserIds.shift(); + if (id) { + const text = + messages + .find((m) => m.id === id) + ?.parts.find((p) => p.type === "text")?.text ?? 
""; + if (!text) { + messages = messages.filter((m) => m.id !== id); + } + } + } else if (role === "assistant") { + // If no user transcript arrived for this turn, remove the placeholder + const pendingId = pendingUserIds.shift(); + if (pendingId && streamingIds.user === null) { + messages = messages.filter((m) => m.id !== pendingId); + } + } + streamingIds[role] = null; + }, + }; + + async function handleSend( + text: string, + selectedModelData: ProviderModel | undefined, + tools: OpenAIFunctionTool[], + ) { + if (!selectedModelData) return; + + const userMessage: UIMessage = { + id: makeMessageId(), + role: "user", + parts: [{ type: "text", text }], + }; + const assistantMessageId = makeMessageId(); + const conversationForModel = [...messages, userMessage]; + messages = [ + ...conversationForModel, + { + id: assistantMessageId, + role: "assistant", + parts: [{ type: "text", text: "" }], + metadata: { modelName: selectedModelData.modelName }, + }, + ]; + chatStatus = "submitted"; + + try { + chatStatus = "streaming"; + for await (const textPart of chatService.streamChat( + selectedModelData, + conversationForModel, + tools, + )) { + messages = messages.map((message) => { + if ( + message.id !== assistantMessageId || + message.role !== "assistant" + ) + return message; + const previousText = + message.parts.find((part) => part.type === "text") + ?.text ?? ""; + return { + ...message, + parts: [ + { + type: "text", + text: `${previousText}${textPart}`, + }, + ], + }; + }); + } + } catch { + messages = messages.map((message) => { + if ( + message.id !== assistantMessageId || + message.role !== "assistant" + ) + return message; + return { + ...message, + parts: [ + { + type: "text", + text: "Could not reach the selected provider. 
Check URL, API key, and CORS settings.", + }, + ], + }; + }); + } finally { + chatStatus = "ready"; + } + } + + return { + get messages() { + return messages; + }, + get chatStatus() { + return chatStatus; + }, + get isIdle() { + return isIdle; + }, + realtimeCallbacks, + handleSend, + }; +} diff --git a/frontend/omni/src/modules/chat/locales/de.json b/frontend/omni/src/modules/chat/locales/de.json index 34c20a3..d4e893d 100644 --- a/frontend/omni/src/modules/chat/locales/de.json +++ b/frontend/omni/src/modules/chat/locales/de.json @@ -16,6 +16,11 @@ "navLabel": "Neuer Chat", "chatLabel": "Chat", "newLabel": "Neu", + "startVoice": "Sprache", + "stopVoice": "Sprache beenden", + "connectingVoice": "Verbinde...", + "retryVoice": "Sprache erneut versuchen", + "voiceError": "Sprache fehlgeschlagen – Klicken zum Wiederholen", "suggestions": [ "Was sind die neuesten KI-Trends?", "Wie funktioniert maschinelles Lernen?", diff --git a/frontend/omni/vite.config.ts b/frontend/omni/vite.config.ts index c6a3222..a7a04e0 100644 --- a/frontend/omni/vite.config.ts +++ b/frontend/omni/vite.config.ts @@ -7,7 +7,10 @@ export default defineConfig({ plugins: [tailwindcss(), svelte()], server: { proxy: { - "/api": "http://localhost:8000", + "/api": { + target: "http://localhost:8000", + ws: true, + }, }, }, preview: {