diff --git a/README.md b/README.md index ea677b44..2fdf8bb4 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,7 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t | **Canvas** | `canvas.present`, `canvas.hide`, `canvas.navigate`, `canvas.eval`, `canvas.snapshot`, `canvas.a2ui.push`, `canvas.a2ui.pushJSONL`, `canvas.a2ui.reset` | Display and control a WebView2 window | | **Screen** | `screen.snapshot`, `screen.record` | Capture screenshots and fixed-duration MP4 screen recordings | | **Camera** | `camera.list`, `camera.snap`, `camera.clip` | Enumerate cameras and capture still photos or short video clips | +| **Speech-to-text** | `stt.transcribe` | Capture audio from the default microphone for a bounded duration and return transcribed text. Default-off; opt-in via Settings. When enabled, advertised to both gateway callers (subject to gateway allowlist) and local MCP clients (subject to bearer token). | | **Location** | `location.get` | Return Windows geolocation when permission is available | | **Device** | `device.info`, `device.status` | Return Windows host/app metadata and lightweight status | | **Text-to-speech** | `tts.speak` | Speak text aloud through Windows speech synthesis, or ElevenLabs when configured | diff --git a/docs/WINDOWS_NODE_TESTING.md b/docs/WINDOWS_NODE_TESTING.md index 2c20c295..8ac4b06f 100644 --- a/docs/WINDOWS_NODE_TESTING.md +++ b/docs/WINDOWS_NODE_TESTING.md @@ -61,6 +61,7 @@ These features need the gateway to send `node.invoke` commands: | `location.get` | Get Windows location | Uses Windows location permission/settings | | `device.info` / `device.status` | Device metadata/status | Returns host/app/locale plus battery/storage/network/uptime payloads | | `browser.proxy` | Proxy browser-control host requests | Requires Browser proxy bridge enabled, a compatible browser-control host listening on gateway port + 2, and matching browser-control auth | +| `stt.transcribe` | Speech-to-text from default 
microphone | Default-off; bounded `maxDurationMs` ≤ 30000; concatenates phrases until duration elapses; requires explicit gateway allowlist | | `tts.speak` | Speak text aloud | Requires Text-to-speech playback enabled in Settings; gateway mode also requires `tts.speak` in `gateway.nodes.allowCommands` | ## Capabilities Advertised @@ -112,6 +113,40 @@ When the node connects, it advertises these capabilities: - If you see "Camera access blocked", enable camera access for desktop apps in Windows Privacy settings - Packaged MSIX builds will show the system consent prompt automatically +### `stt.transcribe` returns "Speech recognition failed" or "Internal Speech Error" +- Open Windows Settings → Privacy & security → Speech (`ms-settings:privacy-speech`) +- Turn **Online speech recognition** = On. The Windows speech recognizer's default dictation grammar often fails without it, and Windows surfaces an unmapped HRESULT as "Internal Speech Error" +- Open Windows Settings → Time & language → Language & region (`ms-settings:regionlanguage`), select your display language → Language options, and confirm **Speech** appears under Installed features (install it if not, ~50 MB; reboot or sign out/in afterward) +- Verify the recognizer end-to-end with `ms-settings:speech` → "Microphone" → **Get started** before re-trying `stt.transcribe` + +### `stt.transcribe` returns "Microphone permission denied" +- Open Windows Settings → Privacy & security → Microphone +- Ensure **Microphone access** (top-level toggle) is on +- For **unpackaged** tray builds (the default `.\build.ps1` output): ensure **Let desktop apps access your microphone** is on. 
The tray exe will **not** appear as its own row — desktop-app access is granted as a group, not per-app +- For **packaged MSIX** tray builds: the tray appears as its own entry under "Let apps access your microphone" and must be individually enabled (the OS shows a consent prompt on first use) +- After changing permissions, re-pair the node so the gateway picks up the new advertised command + +### `stt.transcribe` returns "Language pack 'X' is not installed" +- Open Windows Settings → Time & language → Language & region +- Add the requested display language and ensure the **Speech** optional feature is installed +- Restart the tray after installing the speech pack + +### Manual STT validation +1. Enable Node Mode in Settings. +2. Enable **Speech-to-text (microphone)** in Settings → Node mode. +3. Append `stt.transcribe` to your existing gateway allowlist (do **not** copy a literal `...` — substitute the commands you already allow). For example, starting from the recommended Windows safe companion list: + ```bash + openclaw config set gateway.nodes.allowCommands '["canvas.present","canvas.hide","canvas.navigate","canvas.eval","canvas.snapshot","canvas.a2ui.push","canvas.a2ui.pushJSONL","canvas.a2ui.reset","camera.list","location.get","screen.snapshot","device.info","device.status","system.execApprovals.get","system.execApprovals.set","stt.transcribe"]' + openclaw gateway restart + ``` +4. Re-pair or re-approve the node so the gateway refreshes its command snapshot. +5. Invoke and speak a short phrase: + ```bash + openclaw nodes invoke --node --command stt.transcribe \ + --params '{"maxDurationMs":5000,"language":"en-US"}' + ``` +6. The Windows microphone OS indicator should appear during recognition. Confirm a `transcribed:true` payload returns the text. + ## Remaining Work (Roadmap) 1. 
~~**system.run + exec approvals**~~ ✅ Implemented diff --git a/docs/gateway-node-integration.md b/docs/gateway-node-integration.md index 17fe3a9f..50e082d1 100644 --- a/docs/gateway-node-integration.md +++ b/docs/gateway-node-integration.md @@ -336,7 +336,7 @@ Recommended gateway defaults: | Command bucket | Windows default? | Reason | |----------------|------------------|--------| | Safe declared companion commands: `canvas.*`, `camera.list`, `location.get`, `screen.snapshot`, `device.info`, `device.status` | Yes | Matches macOS parity and only applies when declared by the node | -| Dangerous/privacy-heavy commands: `camera.snap`, `camera.clip`, `screen.record`, write commands like `contacts.add` | No | Existing gateway model already requires explicit `gateway.nodes.allowCommands` | +| Dangerous/privacy-heavy commands: `camera.snap`, `camera.clip`, `screen.record`, `stt.transcribe`, write commands like `contacts.add` | No | Existing gateway model already requires explicit `gateway.nodes.allowCommands` | | Exec commands: `system.run`, `system.run.prepare`, `system.which`, `system.notify`, `browser.proxy` | Yes | Existing Windows headless-host behavior | Until the gateway expands Windows safe defaults, the practical local solution is: @@ -364,6 +364,7 @@ Privacy-sensitive commands should stay out of the default safe list and should o camera.snap camera.clip screen.record +stt.transcribe ``` After changing either `gateway.nodes.allowCommands` or `gateway.nodes.denyCommands`, re-approve or re-pair the Windows node. Approved device records may keep a snapshot of the commands that were visible at approval time, so a gateway restart alone may not refresh existing approvals. @@ -424,6 +425,7 @@ Proposal: - `camera.snap` - `camera.clip` - `screen.record` + - `stt.transcribe` - write commands such as `contacts.add`, `calendar.add`, etc. This does not grant capabilities to headless Windows hosts by itself. 
A command still has to pass both gates: the node must declare it in `commands`, and the gateway policy must allow it. Headless Windows node hosts that only declare `system.run` / `system.which` remain exec-only. @@ -441,7 +443,7 @@ When shipping the Windows node, README/wiki should tell users: > ``` > Then re-pair the node (`openclaw devices reject ` + re-approve). > -> Add `camera.snap`, `camera.clip`, and `screen.record` only when you explicitly want to allow privacy-sensitive camera or screen capture. +> Add `camera.snap`, `camera.clip`, `screen.record`, and `stt.transcribe` only when you explicitly want to allow privacy-sensitive camera, screen, or microphone capture. > > The Windows tray Command Center (`openclaw://commandcenter`) surfaces these policy problems directly: it separates safe companion allowlist fixes from privacy-sensitive opt-ins and provides copyable repair text for safe fixes or pending pairing approval. diff --git a/src/OpenClaw.Shared/Audio/AudioModels.cs b/src/OpenClaw.Shared/Audio/AudioModels.cs new file mode 100644 index 00000000..8016ead8 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/AudioModels.cs @@ -0,0 +1,70 @@ +using System; +using System.Collections.Generic; + +namespace OpenClaw.Shared.Audio; + +/// Result of a speech-to-text transcription segment. +public sealed class TranscriptionResult +{ + public string Text { get; init; } = ""; + public TimeSpan Start { get; init; } + public TimeSpan End { get; init; } + public string Language { get; init; } = "en"; +} + +/// +/// Aggregated result of a single silence-bounded utterance — i.e. all the +/// Whisper segments produced from one VAD-bounded speech burst, combined. +/// Consumers that need "what the user said" (chat submission, stt.listen) +/// should listen for this event instead of per-segment TranscriptionResult +/// to avoid sending partial text. +/// +public sealed class UtteranceResult +{ + /// Concatenated text across all segments, single-spaced. 
+ public string Text { get; init; } = ""; + /// Language detected on the first segment, or null if no segments. + public string? Language { get; init; } + /// Start of the first segment relative to capture start. + public TimeSpan Start { get; init; } + /// End of the last segment relative to capture start. + public TimeSpan End { get; init; } + /// Immutable snapshot of the per-segment results. + public IReadOnlyList Segments { get; init; } = Array.Empty(); +} + +/// Voice-activity detection event. +public sealed class VadEvent +{ + public bool IsSpeaking { get; init; } + public float Probability { get; init; } +} + +/// Configuration for the audio pipeline. +public sealed class AudioPipelineOptions +{ + /// Path to the Whisper GGML model file. + public string ModelPath { get; init; } = ""; + + /// Language code for STT (e.g. "en", "auto"). + public string Language { get; init; } = "auto"; + + /// Seconds of silence before a speech segment is finalized. + public float SilenceTimeoutSeconds { get; init; } = 1.5f; + + /// Optional audio device ID. Null = system default microphone. + public string? DeviceId { get; init; } + + /// VAD probability threshold (0.0–1.0). Audio above this is considered speech. + public float VadThreshold { get; init; } = 0.3f; +} + +/// Pipeline state. +public enum AudioPipelineState +{ + Stopped, + Starting, + Listening, + Processing, + Error +} diff --git a/src/OpenClaw.Shared/Audio/PiperVoiceManager.cs b/src/OpenClaw.Shared/Audio/PiperVoiceManager.cs new file mode 100644 index 00000000..5c9f3a5f --- /dev/null +++ b/src/OpenClaw.Shared/Audio/PiperVoiceManager.cs @@ -0,0 +1,390 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.IO.Compression; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Audio; + +/// +/// Manages downloads and on-disk lifecycle for Piper TTS voices. 
+/// +/// Each "voice" is a sherpa-onnx pre-packaged tarball that contains +/// everything needed for offline synthesis — the .onnx model, the +/// tokens.txt phoneme map, and the language-specific espeak-ng-data. +/// We use the sherpa-onnx repackaged distribution rather than the raw +/// HuggingFace Piper voices because the latter requires the user (or +/// us) to ship espeak-ng-data separately (~80 MB shared across voices). +/// +/// Storage layout under the tray's data directory: +/// models/piper/<voice-id>/ +/// <voice-id>.onnx +/// tokens.txt +/// espeak-ng-data/... +/// +/// Each voice is ~50 MB compressed, ~80 MB extracted (with espeak data). +/// +/// **TODO (pre-GA):** SHA-256 verification of downloaded tarballs before +/// extraction (Audio_FollowUps.md §2). The current implementation trusts +/// HTTPS + the system trust chain only. +/// +public sealed class PiperVoiceManager +{ + private readonly string _voicesDirectory; + private readonly IOpenClawLogger _logger; + // Per-voice single-flight gate: prevents racing the same voice download + // from two callers (e.g. UI and a programmatic caller). Static so two + // PiperVoiceManager instances over the same data directory still + // coalesce against the same in-flight task. + private static readonly ConcurrentDictionary> InFlightDownloads = new(StringComparer.OrdinalIgnoreCase); + + /// + /// Curated catalog of Piper voices we offer in the UI. Each entry is + /// a sherpa-onnx pre-packaged tarball from the project's GitHub + /// releases. To add a voice: pick its key from + /// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models, + /// download the tarball, compute its SHA-256, and pin it below. + /// Sizes shown in the UI are approximate compressed sizes. + /// + /// SECURITY — pinned SHA-256 hashes (lowercase hex) verified against + /// the sherpa-onnx GitHub release on 2026-05-05. Downloads with a + /// different hash are rejected and the partial tarball is deleted. 
+ /// Before any public release: re-verify each hash from an independent + /// source and document provenance in Audio_FollowUps.md §2. + /// + public static readonly PiperVoiceInfo[] AvailableVoices = + [ + new("en_US-amy-low", "English (US) — Amy (low quality, fast)", "en-US", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2", + "c70f5284a09a7fd4ed203b39b2ff51cac1432b422b852eb647b481dade3cf639"), + new("en_US-libritts-high","English (US) — LibriTTS (high quality)", "en-US", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts-high.tar.bz2", + "d9d35056703fd38ed38e95c202a50f603fefdc8a92a7b6332c4f1a41616eac72"), + new("en_GB-alan-low", "English (GB) — Alan (low quality, fast)", "en-GB", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-alan-low.tar.bz2", + "1308e730b7a12c3b64b669d65daa0138fcb83b1a086edee92fa9fa68cb0290dd"), + new("fr_FR-siwis-low", "Français (FR) — Siwis (low quality, fast)","fr-FR", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-fr_FR-siwis-low.tar.bz2", + "3d69170c160c8375c4123901a72a3845222b39456d39ab74f5bbd7310952b5af"), + new("de_DE-thorsten-low","Deutsch (DE) — Thorsten (low quality)", "de-DE", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-de_DE-thorsten-low.tar.bz2", + "41fab35910fdcec4696b031951d8fd6c262e594cf77b35e1068fadbeb5a091a6"), + new("zh_CN-huayan-medium","中文 (CN) — Huayan (medium quality)", "zh-CN", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-zh_CN-huayan-medium.tar.bz2", + "dbdfec42b91d9cee31cce9ff4b3e9c305eb6fbf60546d071f7e46273554cce6b"), + ]; + + public PiperVoiceManager(string dataDirectory, IOpenClawLogger logger) + { + _voicesDirectory = Path.Combine(dataDirectory, "models", "piper"); + _logger = logger; + Directory.CreateDirectory(_voicesDirectory); + } + + /// Root directory where this 
voice's files live (created lazily). + public string GetVoiceDirectory(string voiceId) + { + var info = FindVoice(voiceId); + return Path.Combine(_voicesDirectory, info.VoiceId); + } + + /// Path to the .onnx model file for a downloaded voice. + public string GetModelPath(string voiceId) + { + var dir = GetVoiceDirectory(voiceId); + // sherpa-onnx tarballs put files at the root of the voice dir; the + // model file is named after the voice id. + return Path.Combine(dir, $"{voiceId}.onnx"); + } + + /// Path to tokens.txt (phoneme map). + public string GetTokensPath(string voiceId) => Path.Combine(GetVoiceDirectory(voiceId), "tokens.txt"); + + /// Path to the espeak-ng-data directory bundled with this voice. + public string GetEspeakDataDir(string voiceId) => Path.Combine(GetVoiceDirectory(voiceId), "espeak-ng-data"); + + /// True when all three files are present on disk. + public bool IsVoiceDownloaded(string voiceId) + { + try + { + return File.Exists(GetModelPath(voiceId)) + && File.Exists(GetTokensPath(voiceId)) + && Directory.Exists(GetEspeakDataDir(voiceId)); + } + catch + { + // FindVoice throws on unknown voiceId — treat as not-downloaded. + return false; + } + } + + /// + /// Download and extract a Piper voice from the sherpa-onnx release. + /// Reports progress as bytes downloaded / total bytes (extraction + /// progress is not reported separately). + /// Per-voice single-flight: concurrent calls for the same voice await + /// the in-flight download instead of racing on the same temp tarball. + /// + public Task DownloadVoiceAsync( + string voiceId, + IProgress<(long downloaded, long total)>? 
progress = null, + CancellationToken cancellationToken = default) + { + var info = FindVoice(voiceId); + if (IsVoiceDownloaded(info.VoiceId)) + { + _logger.Info($"Piper voice '{info.VoiceId}' already downloaded"); + return Task.CompletedTask; + } + + // Preflight: bail out before downloading 50-150 MB if the OS isn't + // capable of extracting the .tar.bz2 we'd produce. tar.exe ships with + // Windows 10 1803+; older systems would fail at the extract step + // after a long, wasted download. + EnsureExtractorAvailable(); + + var key = info.VoiceId; + return SingleFlightDownload.RunAsync( + InFlightDownloads, + key, + token => DownloadVoiceCoreAsync(info, progress, token), + cancellationToken); + } + + private async Task DownloadVoiceCoreAsync( + PiperVoiceInfo info, + IProgress<(long downloaded, long total)>? progress, + CancellationToken cancellationToken) + { + // SECURITY: refuse to install any voice that doesn't have a pinned + // hash. See Audio_FollowUps.md §2. + if (string.IsNullOrWhiteSpace(info.Sha256)) + { + throw new InvalidOperationException( + $"Piper voice '{info.VoiceId}' has no pinned SHA-256; refusing to download. " + + "Add a verified hash to AvailableVoices before enabling this voice."); + } + + var voiceDir = Path.Combine(_voicesDirectory, info.VoiceId); + Directory.CreateDirectory(voiceDir); + var tarballPath = Path.Combine(voiceDir, $"{info.VoiceId}.tar.bz2.tmp"); + _logger.Info($"Downloading Piper voice '{info.VoiceId}' from {info.DownloadUrl}"); + + try + { + using var httpClient = new HttpClient(); + httpClient.Timeout = TimeSpan.FromMinutes(10); + using var response = await httpClient.GetAsync(info.DownloadUrl, HttpCompletionOption.ResponseHeadersRead, cancellationToken).ConfigureAwait(false); + response.EnsureSuccessStatusCode(); + + var totalBytes = response.Content.Headers.ContentLength ?? 
0; + using (var contentStream = await response.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false)) + using (var fileStream = new FileStream(tarballPath, FileMode.Create, FileAccess.Write, FileShare.None, 81920)) + { + var buffer = new byte[81920]; + long downloaded = 0; + int bytesRead; + while ((bytesRead = await contentStream.ReadAsync(buffer, cancellationToken).ConfigureAwait(false)) > 0) + { + await fileStream.WriteAsync(buffer.AsMemory(0, bytesRead), cancellationToken).ConfigureAwait(false); + downloaded += bytesRead; + progress?.Report((downloaded, totalBytes)); + } + } + + // SECURITY: verify SHA-256 of the downloaded tarball BEFORE we + // hand it to the extractor. tar reads file contents to disk; an + // attacker-controlled tarball could plant arbitrary files (path + // traversal aside, the .onnx model itself is loaded into the + // process). Fail closed on mismatch — partial dir cleanup runs + // in the catch block below. + await VerifyHashAsync(tarballPath, info.Sha256, info.VoiceId, cancellationToken); + + _logger.Info($"Extracting Piper voice '{info.VoiceId}'"); + ExtractTarBz2(tarballPath, voiceDir, cancellationToken); + + // Verify the extraction produced the files we expect; if not, + // tear the half-extracted dir down so a retry starts clean. + if (!IsVoiceDownloaded(info.VoiceId)) + { + throw new InvalidOperationException( + $"Extraction of Piper voice '{info.VoiceId}' did not produce the expected layout."); + } + + _logger.Info($"Piper voice '{info.VoiceId}' verified and ready at {voiceDir}"); + } + catch + { + // Best-effort cleanup — leaves the user able to retry without + // leftover partial files. 
+ try { if (File.Exists(tarballPath)) File.Delete(tarballPath); } catch { /* swallow */ } + try { if (Directory.Exists(voiceDir) && !IsVoiceDownloaded(info.VoiceId)) Directory.Delete(voiceDir, recursive: true); } catch { /* swallow */ } + throw; + } + finally + { + try { if (File.Exists(tarballPath)) File.Delete(tarballPath); } catch { /* swallow */ } + } + } + + /// + /// Compute SHA-256 of and compare to + /// . Throws on mismatch (caller is + /// expected to delete the file). Does not echo the actual hash to + /// avoid handing attackers a confirmation oracle. + /// + private static async Task VerifyHashAsync(string filePath, string expectedHex, string assetName, CancellationToken cancellationToken) + { + using var sha = System.Security.Cryptography.SHA256.Create(); + await using var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read, 81920, useAsync: true); + var actual = await sha.ComputeHashAsync(stream, cancellationToken); + var actualHex = Convert.ToHexString(actual).ToLowerInvariant(); + if (!string.Equals(actualHex, expectedHex, StringComparison.OrdinalIgnoreCase)) + { + throw new System.Security.SecurityException( + $"Piper voice '{assetName}' failed integrity check. The downloaded tarball does not match the pinned SHA-256."); + } + } + + /// Delete a downloaded voice directory. + public bool DeleteVoice(string voiceId) + { + var info = FindVoice(voiceId); + var dir = Path.Combine(_voicesDirectory, info.VoiceId); + if (!Directory.Exists(dir)) return false; + Directory.Delete(dir, recursive: true); + _logger.Info($"Deleted Piper voice '{info.VoiceId}'"); + return true; + } + + /// Total disk usage of a downloaded voice, or 0 if not downloaded. 
+ public long GetVoiceSize(string voiceId) + { + var info = FindVoice(voiceId); + var dir = Path.Combine(_voicesDirectory, info.VoiceId); + if (!Directory.Exists(dir)) return 0; + long total = 0; + foreach (var f in Directory.EnumerateFiles(dir, "*", SearchOption.AllDirectories)) + { + try { total += new FileInfo(f).Length; } catch { /* skip */ } + } + return total; + } + + /// + /// Probe the bundled OS tar.exe used by . + /// Throws a clear error before any network I/O happens so users on + /// downlevel Windows aren't left with a half-downloaded tarball. + /// + private static void EnsureExtractorAvailable() + { + try + { + var psi = new System.Diagnostics.ProcessStartInfo + { + FileName = "tar", + ArgumentList = { "--version" }, + UseShellExecute = false, + CreateNoWindow = true, + RedirectStandardOutput = true, + RedirectStandardError = true, + }; + using var proc = System.Diagnostics.Process.Start(psi); + if (proc == null) + { + throw new InvalidOperationException("tar.exe not found on PATH."); + } + proc.WaitForExit(2000); + if (!proc.HasExited) + { + try { proc.Kill(entireProcessTree: true); } catch { /* swallow */ } + throw new InvalidOperationException("tar.exe didn't respond to --version."); + } + if (proc.ExitCode != 0) + { + throw new InvalidOperationException($"tar.exe --version returned exit code {proc.ExitCode}."); + } + } + catch (System.ComponentModel.Win32Exception ex) + { + throw new InvalidOperationException( + "Piper voices need bundled tar (Windows 10 1803+). " + + "Your system doesn't have tar on PATH; please update Windows or install a tar utility.", ex); + } + } + + /// + /// Extract a .tar.bz2 archive in-place. We use SharpCompress (already a + /// transitive dependency via PiperSharp's ecosystem, but explicit here) + /// so we don't need to shell out to tar.exe. 
+ /// + private static void ExtractTarBz2(string archivePath, string destinationDir, CancellationToken cancellationToken) + { + // SharpCompress isn't a direct dep of OpenClaw.Shared today; we + // intentionally use the BCL .tar reader on top of a bzip2 stream + // from a small inline implementation. Keeping the dep surface small + // matters in this assembly because everything here is also referenced + // from OpenClaw.Cli. + // + // .NET 7+ ships System.Formats.Tar; bzip2 is not in the BCL, so we + // bring it in via a thin wrapper. For now the simplest-correct path + // is to call out to the OS-bundled `tar` (Win10 1803+ ships it), + // which transparently handles bz2. + var psi = new System.Diagnostics.ProcessStartInfo + { + FileName = "tar", + ArgumentList = { "-xjf", archivePath, "-C", destinationDir, "--strip-components=1" }, + UseShellExecute = false, + CreateNoWindow = true, + RedirectStandardError = true, + }; + using var proc = System.Diagnostics.Process.Start(psi) + ?? throw new InvalidOperationException("Could not start tar to extract Piper voice"); + + // Cancellation: kill the tar process if requested. + using var reg = cancellationToken.Register(() => { try { proc.Kill(entireProcessTree: true); } catch { /* swallow */ } }); + + proc.WaitForExit(); + if (proc.ExitCode != 0) + { + var err = proc.StandardError.ReadToEnd(); + throw new InvalidOperationException($"tar extraction failed (exit {proc.ExitCode}): {err}"); + } + } + + private static PiperVoiceInfo FindVoice(string voiceId) + { + foreach (var v in AvailableVoices) + { + if (string.Equals(v.VoiceId, voiceId, StringComparison.OrdinalIgnoreCase)) + return v; + } + var available = string.Join(", ", AvailableVoicesIds()); + throw new ArgumentException($"Unknown Piper voice: '{voiceId}'. Available: {available}"); + } + + private static IEnumerable AvailableVoicesIds() + { + foreach (var v in AvailableVoices) yield return v.VoiceId; + } +} + +/// Metadata about a Piper voice variant. 
+/// Short id, e.g. "en_US-amy-low". +/// Human-readable label for UI. +/// BCP-47 tag. +/// HTTPS URL of the .tar.bz2. +/// Pinned lowercase hex SHA-256 of the downloaded +/// tarball. MUST be set; downloads are refused when null. See the catalog +/// for the "verified on" date — these need re-verification before any +/// public release (see Audio_FollowUps.md §2). +public sealed record PiperVoiceInfo( + string VoiceId, + string DisplayName, + string LanguageTag, + string DownloadUrl, + string? Sha256); diff --git a/src/OpenClaw.Shared/Audio/SileroVadModelManifest.cs b/src/OpenClaw.Shared/Audio/SileroVadModelManifest.cs new file mode 100644 index 00000000..73637ba0 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/SileroVadModelManifest.cs @@ -0,0 +1,28 @@ +namespace OpenClaw.Shared.Audio; + +/// +/// Pinned descriptor for the Silero VAD ONNX model that the audio +/// pipeline auto-downloads on first use. +/// +/// SECURITY — same fail-closed verification discipline as +/// and : +/// the runtime checks the downloaded file's SHA-256 against +/// before installing it. The pinned hash here was +/// captured against the upstream raw URL on 2026-05-05; re-verify from +/// an independent source before any public release (Audio_FollowUps.md +/// §2 captures the broader signed-manifest plan). +/// +public static class SileroVadModelManifest +{ + public const string FileName = "silero_vad.onnx"; + + public const string DownloadUrl = + "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"; + + /// Lowercase hex SHA-256 of the canonical upstream file. + public const string Sha256 = "1a153a22f4509e292a94e67d6f9b85e8deb25b4988682b7e174c65279d8788e3"; + + /// Approximate compressed size in bytes (UI hint; actual size + /// is asserted via the SHA-256 check). 
+ public const long ApproximateSizeBytes = 2_327_524; +} diff --git a/src/OpenClaw.Shared/Audio/SingleFlightDownload.cs b/src/OpenClaw.Shared/Audio/SingleFlightDownload.cs new file mode 100644 index 00000000..215d46d6 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/SingleFlightDownload.cs @@ -0,0 +1,52 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Audio; + +internal static class SingleFlightDownload +{ + public static Task RunAsync( + ConcurrentDictionary> inFlight, + string key, + Func startDownload, + CancellationToken waitCancellationToken = default) + { + var candidate = new Lazy(() => + { + try + { + return startDownload(CancellationToken.None) + ?? Task.FromException(new InvalidOperationException("Download factory returned null.")); + } + catch (Exception ex) + { + return Task.FromException(ex); + } + }, LazyThreadSafetyMode.ExecutionAndPublication); + + var lazy = inFlight.GetOrAdd(key, candidate); + Task task; + try + { + task = lazy.Value; + } + catch + { + inFlight.TryRemove(new KeyValuePair>(key, lazy)); + throw; + } + + _ = task.ContinueWith( + _ => inFlight.TryRemove(new KeyValuePair>(key, lazy)), + CancellationToken.None, + TaskContinuationOptions.ExecuteSynchronously, + TaskScheduler.Default); + + return waitCancellationToken.CanBeCanceled + ? task.WaitAsync(waitCancellationToken) + : task; + } +} diff --git a/src/OpenClaw.Shared/Audio/SpeechToTextService.cs b/src/OpenClaw.Shared/Audio/SpeechToTextService.cs new file mode 100644 index 00000000..b0101c26 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/SpeechToTextService.cs @@ -0,0 +1,182 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Whisper.net; +using Whisper.net.Ggml; + +namespace OpenClaw.Shared.Audio; + +/// +/// Wraps Whisper.net for speech-to-text transcription. 
+/// Lazily loads the model on first use and caches the factory. +/// Thread-safe: concurrent calls are serialized by a semaphore. +/// +public sealed class SpeechToTextService : IDisposable +{ + private readonly IOpenClawLogger _logger; + private readonly SemaphoreSlim _gate = new(1, 1); + private WhisperFactory? _factory; + private string? _loadedModelPath; + + public bool IsModelLoaded => _factory != null; + public string? LoadedModelPath => _loadedModelPath; + + public SpeechToTextService(IOpenClawLogger logger) + { + _logger = logger; + } + + /// Load (or reload) the Whisper model from disk. + public void LoadModel(string modelPath) + { + if (!System.IO.File.Exists(modelPath)) + throw new System.IO.FileNotFoundException($"Whisper model not found: {modelPath}"); + + _factory?.Dispose(); + _factory = WhisperFactory.FromPath(modelPath); + _loadedModelPath = modelPath; + _logger.Info($"Whisper model loaded: {modelPath}"); + } + + /// Unload the current model and free memory. + public void UnloadModel() + { + _factory?.Dispose(); + _factory = null; + _loadedModelPath = null; + _logger.Info("Whisper model unloaded"); + } + + /// + /// Transcribe raw 16 kHz mono PCM float samples. + /// Returns all detected segments. + /// + public async Task> TranscribeAsync( + float[] samples, + string language = "auto", + CancellationToken cancellationToken = default) + { + if (_factory == null) + throw new InvalidOperationException("No Whisper model is loaded. Call LoadModel first."); + + await _gate.WaitAsync(cancellationToken); + try + { + // Whisper.net's WithLanguage expects either "auto" or a 2-letter + // ISO 639-1 code. The capability validator accepts the broader + // BCP-47 shape ("en-US", "zh-Hans-CN") because that's what the + // public docs advertise; normalize down here so Whisper actually + // sees something it understands. 
+ var whisperLang = NormalizeForWhisper(language); + var builder = _factory.CreateBuilder() + .WithLanguage(whisperLang) + .WithThreads(Math.Max(1, Environment.ProcessorCount / 2)); + + using var processor = builder.Build(); + + using var wavStream = PcmToWavStream(samples, 16000); + + var results = new List(); + await foreach (var segment in processor.ProcessAsync(wavStream, cancellationToken)) + { + var text = segment.Text?.Trim(); + if (!string.IsNullOrEmpty(text)) + { + results.Add(new TranscriptionResult + { + Text = text, + Start = segment.Start, + End = segment.End, + Language = whisperLang + }); + } + } + + return results; + } + finally + { + _gate.Release(); + } + } + + /// + /// Convert raw 16-bit PCM float samples to a WAV MemoryStream. + /// Whisper.net processes WAV streams natively. + /// + private static System.IO.MemoryStream PcmToWavStream(float[] samples, int sampleRate) + { + var ms = new System.IO.MemoryStream(); + using var writer = new System.IO.BinaryWriter(ms, System.Text.Encoding.UTF8, leaveOpen: true); + + int bitsPerSample = 16; + short channels = 1; + int byteRate = sampleRate * channels * bitsPerSample / 8; + short blockAlign = (short)(channels * bitsPerSample / 8); + int dataSize = samples.Length * blockAlign; + + // RIFF header + writer.Write("RIFF"u8); + writer.Write(36 + dataSize); + writer.Write("WAVE"u8); + + // fmt subchunk + writer.Write("fmt "u8); + writer.Write(16); // subchunk size + writer.Write((short)1); // PCM format + writer.Write(channels); + writer.Write(sampleRate); + writer.Write(byteRate); + writer.Write(blockAlign); + writer.Write((short)bitsPerSample); + + // data subchunk + writer.Write("data"u8); + writer.Write(dataSize); + + // Convert float [-1.0, 1.0] to int16 + foreach (var sample in samples) + { + var clamped = Math.Clamp(sample, -1.0f, 1.0f); + var int16 = (short)(clamped * 32767); + writer.Write(int16); + } + + writer.Flush(); + ms.Position = 0; + return ms; + } + + /// + /// Reduce a BCP-47 tag (e.g. 
"en-US", "zh-Hans-CN") to the 2-letter + /// language subtag that Whisper.net's WithLanguage call expects. + /// "auto" passes through unchanged. Returns "auto" for nulls/whitespace + /// or values that don't begin with at least 2 ASCII letters. + /// + internal static string NormalizeForWhisper(string? language) + { + if (string.IsNullOrWhiteSpace(language)) return "auto"; + var trimmed = language.Trim(); + if (string.Equals(trimmed, "auto", StringComparison.OrdinalIgnoreCase)) return "auto"; + + // Take everything up to the first '-' (the primary subtag) and lowercase. + var dash = trimmed.IndexOf('-'); + var primary = (dash >= 0 ? trimmed[..dash] : trimmed).ToLowerInvariant(); + + // Whisper expects 2-letter ISO 639-1. If the caller handed us a + // 3-letter ISO 639-3 tag (no good cross-walk without a table) or + // garbage, fall back to auto-detection rather than silently + // sending an invalid value. + if (primary.Length != 2 || primary[0] is < 'a' or > 'z' || primary[1] is < 'a' or > 'z') + return "auto"; + + return primary; + } + + public void Dispose() + { + _factory?.Dispose(); + _gate.Dispose(); + } +} diff --git a/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs b/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs new file mode 100644 index 00000000..3112d3a4 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs @@ -0,0 +1,108 @@ +using System; +using Microsoft.ML.OnnxRuntime; +using Microsoft.ML.OnnxRuntime.Tensors; + +namespace OpenClaw.Shared.Audio; + +/// +/// Voice Activity Detection using Silero VAD ONNX model. +/// Processes 16 kHz mono audio in 512-sample chunks (~32 ms each) +/// and returns a speech probability per chunk. +/// +public sealed class VoiceActivityDetector : IDisposable +{ + private InferenceSession? _session; + private float[] _state; // internal RNN state: shape [2, 1, 128] + private readonly int _stateSize; + private readonly IOpenClawLogger _logger; + + /// Expected sample rate for input audio. 
+    public const int SampleRate = 16000;
+
+    /// <summary>Number of samples per VAD chunk (512 @ 16 kHz = 32 ms).</summary>
+    public const int ChunkSamples = 512;
+
+    public bool IsLoaded => _session != null;
+
+    public VoiceActivityDetector(IOpenClawLogger logger)
+    {
+        _logger = logger;
+        _stateSize = 2 * 1 * 128;
+        _state = new float[_stateSize];
+    }
+
+    /// <summary>Load the Silero VAD ONNX model from disk.</summary>
+    public void LoadModel(string modelPath)
+    {
+        if (!System.IO.File.Exists(modelPath))
+            throw new System.IO.FileNotFoundException($"VAD model not found: {modelPath}");
+
+        var opts = new SessionOptions
+        {
+            InterOpNumThreads = 1,
+            IntraOpNumThreads = 1,
+            EnableCpuMemArena = true
+        };
+        opts.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL;
+
+        _session?.Dispose();
+        _session = new InferenceSession(modelPath, opts);
+        ResetState();
+        _logger.Info($"Silero VAD model loaded: {modelPath}");
+    }
+
+    /// <summary>Reset the internal RNN state (call between utterances).</summary>
+    public void ResetState()
+    {
+        Array.Clear(_state, 0, _state.Length);
+    }
+
+    /// <summary>
+    /// Process a single chunk of audio and return the speech probability (0.0–1.0).
+    /// Input must be exactly <see cref="ChunkSamples"/> (512) float samples at 16 kHz.
+    /// </summary>
+    public float ProcessChunk(float[] audioChunk)
+    {
+        if (_session == null)
+            throw new InvalidOperationException("VAD model not loaded. Call LoadModel first.");
+
+        if (audioChunk.Length != ChunkSamples)
+            throw new ArgumentException($"Audio chunk must be exactly {ChunkSamples} samples, got {audioChunk.Length}");
+
+        // Build input tensors matching Silero VAD v5 expected shapes. 
+ // See: github.com/snakers4/silero-vad/blob/master/examples/csharp/SileroVadOnnxModel.cs + var inputTensor = new DenseTensor(audioChunk, new[] { 1, ChunkSamples }); + var srTensor = new DenseTensor(new long[] { SampleRate }, new[] { 1 }); + var stateTensor = new DenseTensor(_state, new[] { 2, 1, 128 }); + + using var results = _session.Run(new List + { + NamedOnnxValue.CreateFromTensor("input", inputTensor), + NamedOnnxValue.CreateFromTensor("sr", srTensor), + NamedOnnxValue.CreateFromTensor("state", stateTensor) + }); + + float probability = 0f; + foreach (var result in results) + { + if (result.Name == "output") + { + var tensor = result.AsTensor(); + probability = tensor.Length > 0 ? tensor.GetValue(0) : 0f; + } + else if (result.Name == "stateN") + { + var newState = result.AsTensor(); + for (int i = 0; i < _stateSize && i < newState.Length; i++) + _state[i] = newState.GetValue(i); + } + } + + return probability; + } + + public void Dispose() + { + _session?.Dispose(); + } +} diff --git a/src/OpenClaw.Shared/Audio/WhisperModelManager.cs b/src/OpenClaw.Shared/Audio/WhisperModelManager.cs new file mode 100644 index 00000000..6b49341e --- /dev/null +++ b/src/OpenClaw.Shared/Audio/WhisperModelManager.cs @@ -0,0 +1,223 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Audio; + +/// +/// Manages Whisper GGML model downloads, storage, and lifecycle. +/// Models are stored in %APPDATA%\OpenClawTray\models\ (or the +/// configured data directory). +/// +public sealed class WhisperModelManager +{ + private readonly string _modelsDirectory; + private readonly IOpenClawLogger _logger; + // Per-model single-flight gate: a manual auto-download (VoiceService + // EnsureInitializedAsync) and a UI-triggered download for the same + // model would otherwise both write the same .tmp file. 
Static so an + // additional manager instance constructed elsewhere (e.g. the Settings + // page's status-only check) doesn't bypass the lock. + private static readonly ConcurrentDictionary> InFlightDownloads = new(StringComparer.OrdinalIgnoreCase); + + /// + /// Known Whisper model definitions. + /// + /// SECURITY — pinned SHA-256 hashes (lowercase hex) verified against + /// HuggingFace on 2026-05-05. Downloads with a different hash are + /// rejected and the partial file is deleted. Before any public release: + /// re-verify each hash from an independent source and document the + /// provenance in Audio_FollowUps.md §2 (also consider replacing this + /// inline table with a signed manifest). + /// + public static readonly WhisperModelInfo[] AvailableModels = + [ + new("ggml-tiny.bin", "tiny", 77_691_713, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin", + "be07e048e1e599ad46341c8d2a135645097a538221678b7acdd1b1919c6e1b21"), + new("ggml-base.bin", "base", 147_951_465, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin", + "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"), + new("ggml-small.bin", "small", 487_601_967, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin", + "1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b"), + ]; + + public WhisperModelManager(string dataDirectory, IOpenClawLogger logger) + { + _modelsDirectory = Path.Combine(dataDirectory, "models"); + _logger = logger; + Directory.CreateDirectory(_modelsDirectory); + } + + /// Full file path for a given model name. + public string GetModelPath(string modelName) + { + var info = FindModel(modelName); + return Path.Combine(_modelsDirectory, info.FileName); + } + + /// Check whether a model file already exists on disk. 
+ public bool IsModelDownloaded(string modelName) + { + var path = GetModelPath(modelName); + return File.Exists(path); + } + + /// Get the size of a downloaded model, or 0 if not downloaded. + public long GetModelSize(string modelName) + { + var path = GetModelPath(modelName); + return File.Exists(path) ? new FileInfo(path).Length : 0; + } + + /// + /// Download a model from HuggingFace if not already present. + /// Reports progress as bytes downloaded / total bytes. + /// Per-model single-flight: concurrent calls for the same model await + /// the in-flight download instead of racing on the same .tmp file. + /// + public Task DownloadModelAsync( + string modelName, + IProgress<(long downloaded, long total)>? progress = null, + CancellationToken cancellationToken = default) + { + var info = FindModel(modelName); + var destPath = Path.Combine(_modelsDirectory, info.FileName); + + if (File.Exists(destPath)) + { + _logger.Info($"Model '{modelName}' already exists at {destPath}"); + return Task.CompletedTask; + } + + // Use the canonical key (FileName) so two callers that pass "base" + // and "ggml-base.bin" still coalesce. + var key = info.FileName; + return SingleFlightDownload.RunAsync( + InFlightDownloads, + key, + token => DownloadModelCoreAsync(info, destPath, progress, token), + cancellationToken); + } + + private async Task DownloadModelCoreAsync( + WhisperModelInfo info, + string destPath, + IProgress<(long downloaded, long total)>? progress, + CancellationToken cancellationToken) + { + // SECURITY: a missing pinned hash is treated as a hard failure so we + // never install an unverified asset. The catalog above pins all + // shipped models; if you add a new one without a hash, this is the + // place that refuses to download it. See Audio_FollowUps.md §2. + if (string.IsNullOrWhiteSpace(info.Sha256)) + { + throw new InvalidOperationException( + $"Whisper model '{info.Name}' has no pinned SHA-256; refusing to download. 
" + + "Add a verified hash to AvailableModels before enabling this model."); + } + + _logger.Info($"Downloading model '{info.Name}' from {info.DownloadUrl}"); + var tempPath = destPath + ".tmp"; + + try + { + using var httpClient = new HttpClient(); + httpClient.Timeout = TimeSpan.FromMinutes(30); + using var response = await httpClient.GetAsync(info.DownloadUrl, HttpCompletionOption.ResponseHeadersRead, cancellationToken); + response.EnsureSuccessStatusCode(); + + var totalBytes = response.Content.Headers.ContentLength ?? info.ApproximateSizeBytes; + using (var contentStream = await response.Content.ReadAsStreamAsync(cancellationToken)) + using (var fileStream = new FileStream(tempPath, FileMode.Create, FileAccess.Write, FileShare.None, 81920)) + { + var buffer = new byte[81920]; + long downloadedBytes = 0; + int bytesRead; + + while ((bytesRead = await contentStream.ReadAsync(buffer, cancellationToken)) > 0) + { + await fileStream.WriteAsync(buffer.AsMemory(0, bytesRead), cancellationToken); + downloadedBytes += bytesRead; + progress?.Report((downloadedBytes, totalBytes)); + } + + await fileStream.FlushAsync(cancellationToken); + } + + // SECURITY: verify SHA-256 BEFORE the atomic rename, so a + // tampered file never lands at the canonical path. On mismatch + // we delete the temp file (no partial install) and surface a + // sanitized error — we deliberately do NOT echo the actual + // hash because that gives an attacker a confirmation oracle. + await VerifyHashAsync(tempPath, info.Sha256, info.Name, cancellationToken); + + File.Move(tempPath, destPath, overwrite: true); + _logger.Info($"Model '{info.Name}' downloaded and verified"); + } + catch + { + // Clean up partial download + try { if (File.Exists(tempPath)) File.Delete(tempPath); } catch { /* best effort */ } + throw; + } + } + + /// + /// Compute SHA-256 of and compare to + /// . Throws on mismatch (and the caller + /// is expected to delete the file). 
Does not echo the actual hash to
+    /// avoid handing attackers a confirmation oracle.
+    /// </summary>
+    private static async Task VerifyHashAsync(string filePath, string expectedHex, string assetName, CancellationToken cancellationToken)
+    {
+        using var sha = System.Security.Cryptography.SHA256.Create();
+        await using var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read, 81920, useAsync: true);
+        var actual = await sha.ComputeHashAsync(stream, cancellationToken);
+        var actualHex = Convert.ToHexString(actual).ToLowerInvariant();
+        if (!string.Equals(actualHex, expectedHex, StringComparison.OrdinalIgnoreCase))
+        {
+            throw new System.Security.SecurityException(
+                $"Whisper model '{assetName}' failed integrity check. The downloaded file does not match the pinned SHA-256.");
+        }
+    }
+
+    /// <summary>Delete a downloaded model file.</summary>
+    public bool DeleteModel(string modelName)
+    {
+        var path = GetModelPath(modelName);
+        if (!File.Exists(path)) return false;
+        File.Delete(path);
+        _logger.Info($"Deleted model '{modelName}'");
+        return true;
+    }
+
+    private static WhisperModelInfo FindModel(string modelName)
+    {
+        foreach (var m in AvailableModels)
+        {
+            if (string.Equals(m.Name, modelName, StringComparison.OrdinalIgnoreCase))
+                return m;
+        }
+        throw new ArgumentException($"Unknown model: '{modelName}'. Available: tiny, base, small");
+    }
+}
+
+/// <summary>Metadata about a Whisper model variant.</summary>
+/// <param name="FileName">On-disk filename (e.g. "ggml-base.bin").</param>
+/// <param name="Name">Short identifier used by callers ("tiny" / "base" / "small").</param>
+/// <param name="ApproximateSizeBytes">Approximate size hint for UI; the
+/// actual content is verified against <paramref name="Sha256"/> after download.</param>
+/// <param name="DownloadUrl">HTTPS URL of the model file.</param>
+/// <param name="Sha256">Pinned lowercase hex SHA-256 of the downloaded file.
+/// MUST be set; downloads are refused when null. See the catalog for the
+/// "verified on" date — these need re-verification before any public
+/// release (see Audio_FollowUps.md §2).</param>
+public sealed record WhisperModelInfo( + string FileName, + string Name, + long ApproximateSizeBytes, + string DownloadUrl, + string? Sha256); diff --git a/src/OpenClaw.Shared/Capabilities/SttCapability.cs b/src/OpenClaw.Shared/Capabilities/SttCapability.cs new file mode 100644 index 00000000..4a944054 --- /dev/null +++ b/src/OpenClaw.Shared/Capabilities/SttCapability.cs @@ -0,0 +1,339 @@ +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Capabilities; + +/// +/// Speech-to-text node capability. Three commands: +/// +/// * — bounded fixed-duration capture + transcription. +/// Caller must specify maxDurationMs (capped at ). +/// Useful for quick "give me 5 seconds of audio" prompts. +/// +/// * — VAD-driven capture that returns when speech ends +/// or after timeoutMs (default , range +/// ..). +/// Useful for conversational "listen until I stop talking" prompts. +/// +/// * — reports engine readiness (no PII). +/// +/// The actual engine lives in the tray (Whisper.net + NAudio + Silero VAD). +/// Whisper is local-first and privacy-respecting; the legacy WinRT +/// SpeechRecognizer + desktop SAPI fallback was removed because both +/// stacks are old, can leak audio to the Microsoft cloud (online-speech), +/// and don't work in unpackaged builds. +/// +/// **Privacy invariants for the response surface:** +/// - Validation errors never echo the caller-supplied language string. +/// - Handler exceptions never propagate their Message into the response; +/// full detail stays in the local logger only. This is critical because +/// failed-invoke errors land in recent activity / support bundles. +/// - response carries no PII (no transcript fragments, +/// no language history, no device IDs, no model paths). 
+/// +public sealed class SttCapability : NodeCapabilityBase +{ + public const string TranscribeCommand = "stt.transcribe"; + public const string ListenCommand = "stt.listen"; + public const string StatusCommand = "stt.status"; + + public const int MaxTranscribeDurationMs = 30_000; + public const int MinListenTimeoutMs = 1_000; + public const int MaxListenTimeoutMs = 120_000; + public const int DefaultListenTimeoutMs = 30_000; + + public const string DefaultLanguage = "en-US"; + public const string AutoLanguage = "auto"; + + /// + /// Engine identifier returned in engineEffective on every successful + /// stt.* response. Currently always "whisper"; the field exists so + /// adding a future engine doesn't break the wire shape. + /// + public const string EngineWhisper = "whisper"; + + private static readonly string[] _commands = [TranscribeCommand, ListenCommand, StatusCommand]; + + // Conservative BCP-47 check: 2-3 letter language, optional script + // (4 letter), optional region (2 letter or 3 digit), each separated + // by a hyphen. Rejects whitespace and punctuation that would otherwise + // trip Windows.Globalization.Language ctor. The literal "auto" + // sentinel is accepted in addition (Whisper supports auto-detect). + private static readonly Regex BcpTagRegex = new( + "^[A-Za-z]{2,3}(?:-[A-Za-z]{4})?(?:-(?:[A-Za-z]{2}|[0-9]{3}))?$", + RegexOptions.Compiled); + + public override string Category => "stt"; + public override IReadOnlyList Commands => _commands; + + /// + /// Tray-side handler for : bounded fixed-duration + /// capture + transcription. + /// + public event Func>? TranscribeRequested; + + /// + /// Tray-side handler for : VAD-driven capture that + /// returns on end-of-speech or after timeoutMs. + /// + public event Func>? ListenRequested; + + /// + /// Tray-side handler for : returns per-engine readiness. + /// + public event Func>? 
StatusRequested; + + public SttCapability(IOpenClawLogger logger) : base(logger) { } + + /// + /// Trim and validate a single language tag. Returns the trimmed tag on + /// success, the literal sentinel on a case-insensitive + /// "auto" input, or null if the input is neither. + /// Public so UI surfaces can validate against the same rule the wire applies. + /// + public static string? NormalizeLanguageTag(string tag) + { + var trimmed = tag.Trim(); + if (string.Equals(trimmed, AutoLanguage, StringComparison.OrdinalIgnoreCase)) + return AutoLanguage; + return BcpTagRegex.IsMatch(trimmed) ? trimmed : null; + } + + /// + /// Resolve the language to use for a recognition call: per-call argument + /// wins, then configured setting, then . + /// Returns null if the resolved string fails validation. + /// + public static string? ResolveLanguage(string? requested, string? configured) + { + var candidate = !string.IsNullOrWhiteSpace(requested) + ? requested + : (!string.IsNullOrWhiteSpace(configured) ? configured : DefaultLanguage); + + return NormalizeLanguageTag(candidate!); + } + + public override Task ExecuteAsync(NodeInvokeRequest request) + => ExecuteAsync(request, CancellationToken.None); + + public override async Task ExecuteAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + return request.Command switch + { + TranscribeCommand => await HandleTranscribeAsync(request, cancellationToken).ConfigureAwait(false), + ListenCommand => await HandleListenAsync(request, cancellationToken).ConfigureAwait(false), + StatusCommand => await HandleStatusAsync(cancellationToken).ConfigureAwait(false), + _ => Error($"Unknown command: {request.Command}") + }; + } + + private async Task HandleTranscribeAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + // maxDurationMs is required and bounded server-side. 
We deliberately + // reject 0/negative rather than substituting a default — callers + // explicitly choose how much mic time they're spending. + var maxDurationMs = GetIntArg(request.Args, "maxDurationMs", 0); + if (maxDurationMs <= 0) + return Error("Missing required maxDurationMs"); + if (maxDurationMs > MaxTranscribeDurationMs) + return Error($"maxDurationMs exceeds {MaxTranscribeDurationMs} ms"); + + var requestedLanguage = GetStringArg(request.Args, "language"); + string? resolvedLanguage = null; + if (!string.IsNullOrWhiteSpace(requestedLanguage)) + { + resolvedLanguage = NormalizeLanguageTag(requestedLanguage); + if (resolvedLanguage == null) + return Error("Invalid language tag"); + } + + if (TranscribeRequested == null) + return Error("STT transcribe not available"); + + var args = new SttTranscribeArgs + { + MaxDurationMs = maxDurationMs, + Language = resolvedLanguage // null lets the tray fall back to its configured setting + }; + + Logger.Info($"stt.transcribe: maxDurationMs={args.MaxDurationMs}, language={args.Language ?? "(default)"}"); + + try + { + var result = await TranscribeRequested(args, cancellationToken).ConfigureAwait(false); + return Success(new + { + transcribed = result.Transcribed, + text = result.Text, + durationMs = result.DurationMs, + language = result.Language, + engineEffective = result.EngineEffective + }); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + return Error("Transcribe canceled"); + } + catch (Exception ex) + { + // Privacy: never echo raw exception text into the response. The + // exception flows through the failed-invoke path and may be + // persisted to recent activity / support bundles. Full detail + // stays in the local log only. 
+ Logger.Error("STT transcribe failed", ex); + return Error("Transcribe failed"); + } + } + + private async Task HandleListenAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + // timeoutMs is optional with a sane default; bounded both ways so + // a hostile caller can't pin the mic open for an hour. + var timeoutMs = GetIntArg(request.Args, "timeoutMs", DefaultListenTimeoutMs); + if (timeoutMs < MinListenTimeoutMs) timeoutMs = MinListenTimeoutMs; + if (timeoutMs > MaxListenTimeoutMs) timeoutMs = MaxListenTimeoutMs; + + var requestedLanguage = GetStringArg(request.Args, "language"); + string resolvedLanguage = AutoLanguage; + if (!string.IsNullOrWhiteSpace(requestedLanguage)) + { + var normalized = NormalizeLanguageTag(requestedLanguage); + if (normalized == null) + return Error("Invalid language tag"); + resolvedLanguage = normalized; + } + + if (ListenRequested == null) + return Error("STT listen not available"); + + var args = new SttListenArgs + { + TimeoutMs = timeoutMs, + Language = resolvedLanguage + }; + + Logger.Info($"stt.listen: timeoutMs={timeoutMs}, language={resolvedLanguage}"); + + try + { + var result = await ListenRequested(args, cancellationToken).ConfigureAwait(false); + return Success(new + { + text = result.Text, + language = result.Language, + durationMs = result.DurationMs, + segments = result.Segments, + engineEffective = result.EngineEffective + }); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + return Error("Listen canceled"); + } + catch (Exception ex) + { + // Same privacy invariant as Transcribe. 
+ Logger.Error("STT listen failed", ex); + return Error("Listen failed"); + } + } + + private async Task HandleStatusAsync(CancellationToken cancellationToken) + { + if (StatusRequested == null) + return Error("STT status not available"); + + try + { + var result = await StatusRequested(cancellationToken).ConfigureAwait(false); + return Success(new + { + engine = result.Engine, + readiness = result.Readiness, + modelDownloadProgress = result.ModelDownloadProgress, + isListenWithVadSupported = result.IsListenWithVadSupported, + isBoundedTranscribeSupported = result.IsBoundedTranscribeSupported + }); + } + catch (Exception ex) + { + // Status must not leak engine internals; carry only a fixed message. + Logger.Error("STT status failed", ex); + return Error("Status failed"); + } + } +} + +public sealed class SttTranscribeArgs +{ + public int MaxDurationMs { get; set; } + /// + /// BCP-47 tag (e.g., "en-US"), the literal "auto" sentinel, or null + /// to let the tray fall back to its configured SttLanguage setting. + /// + public string? Language { get; set; } +} + +public sealed class SttTranscribeResult +{ + public bool Transcribed { get; set; } + public string Text { get; set; } = ""; + public int DurationMs { get; set; } + public string Language { get; set; } = SttCapability.DefaultLanguage; + + /// + /// Engine that served this call. Always + /// today; the field exists so a future engine doesn't break the wire. + /// + public string EngineEffective { get; set; } = SttCapability.EngineWhisper; +} + +public sealed class SttListenArgs +{ + public int TimeoutMs { get; set; } + /// + /// BCP-47 tag (e.g., "en-US"), or the literal "auto" sentinel + /// (default; lets Whisper auto-detect). 
+ /// + public string Language { get; set; } = SttCapability.AutoLanguage; +} + +public sealed class SttListenResult +{ + public string Text { get; set; } = ""; + public string Language { get; set; } = SttCapability.AutoLanguage; + public int DurationMs { get; set; } + public IReadOnlyList Segments { get; set; } = Array.Empty(); + + public string EngineEffective { get; set; } = SttCapability.EngineWhisper; +} + +public sealed class SttSegment +{ + public string Text { get; set; } = ""; + public int StartMs { get; set; } + public int EndMs { get; set; } +} + +public sealed class SttStatusResult +{ + public string Engine { get; set; } = SttCapability.EngineWhisper; + + /// One of "ready", "initializing", "model-downloading", "model-not-downloaded", "unavailable". + public string Readiness { get; set; } = "unavailable"; + + /// 0..1 download progress when == "model-downloading"; null otherwise. + public double? ModelDownloadProgress { get; set; } + + public bool IsListenWithVadSupported { get; set; } + public bool IsBoundedTranscribeSupported { get; set; } +} diff --git a/src/OpenClaw.Shared/Capabilities/TtsCapability.cs b/src/OpenClaw.Shared/Capabilities/TtsCapability.cs index c6407828..23878b22 100644 --- a/src/OpenClaw.Shared/Capabilities/TtsCapability.cs +++ b/src/OpenClaw.Shared/Capabilities/TtsCapability.cs @@ -10,6 +10,11 @@ public sealed class TtsCapability : NodeCapabilityBase public const string SpeakCommand = "tts.speak"; public const string WindowsProvider = "windows"; public const string ElevenLabsProvider = "elevenlabs"; + /// + /// Local neural TTS via Sherpa-ONNX wrapping Piper voices. No network + /// egress; voice models download once to %LOCALAPPDATA%. + /// + public const string PiperProvider = "piper"; public const int MaxTextLength = 5000; private static readonly string[] _commands = [SpeakCommand]; @@ -30,7 +35,7 @@ public static string ResolveProvider(string? requestedProvider, string? 
configur : requestedProvider; return string.IsNullOrWhiteSpace(provider) - ? WindowsProvider + ? PiperProvider : provider.Trim().ToLowerInvariant(); } @@ -81,8 +86,14 @@ public override async Task ExecuteAsync( } catch (Exception ex) { + // Privacy: never echo raw exception text into the response. The + // exception flows through the failed-invoke path and may be + // persisted to recent activity / support bundles. ElevenLabs + // error messages can contain key prefixes; OS speech errors + // can contain device names. Full detail stays in the local + // log only. (Same pattern as SttCapability.) Logger.Error("TTS speak failed", ex); - return Error($"Speak failed: {ex.Message}"); + return Error("Speak failed"); } } diff --git a/src/OpenClaw.Shared/DeepLinkParser.cs b/src/OpenClaw.Shared/DeepLinkParser.cs index 21bcdf7c..4ef68266 100644 --- a/src/OpenClaw.Shared/DeepLinkParser.cs +++ b/src/OpenClaw.Shared/DeepLinkParser.cs @@ -20,10 +20,13 @@ public static class DeepLinkParser if (!uri.StartsWith(Scheme, StringComparison.OrdinalIgnoreCase)) return null; - var remainder = uri[Scheme.Length..].TrimEnd('/'); + var remainder = uri[Scheme.Length..]; var queryIndex = remainder.IndexOf('?'); var query = queryIndex >= 0 ? remainder[(queryIndex + 1)..] : ""; - var path = queryIndex >= 0 ? remainder[..queryIndex] : remainder; + // Trim trailing slash AFTER splitting off the query so the + // Windows-canonicalized form `openclaw://send/?args=...` (slash + // BEFORE the `?`) yields path "send", not "send/". + var path = (queryIndex >= 0 ? 
remainder[..queryIndex] : remainder).TrimEnd('/'); var parameters = new Dictionary(StringComparer.OrdinalIgnoreCase); foreach (var part in query.Split('&', StringSplitOptions.RemoveEmptyEntries)) diff --git a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs index e6ce348a..2bae2000 100644 --- a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs +++ b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs @@ -236,9 +236,18 @@ private object HandleToolsList() ["camera.clip"] = "Record a short clip from a camera. Args: deviceId (string, optional), durationMs (int, required, max 60000), format ('mp4'|'webm', default 'mp4'), maxWidth (int, default 1280). Returns { format, durationMs, base64 }.", + // stt.* — microphone capture → text. Default-off; privacy-sensitive. + // Single engine: Whisper.net runs locally on the device. + ["stt.transcribe"] = + "Capture microphone audio for a bounded duration and return the transcribed text. Args: maxDurationMs (int, required, > 0, max 30000), language (string, optional BCP-47 tag like 'en-US' or 'auto' — falls back to the configured SttLanguage setting). Returns { transcribed, text, durationMs, language, engineEffective ('whisper') }. Whisper model is downloaded on first use; until then this returns an error pointing to Voice Settings. Requires NodeSttEnabled.", + ["stt.listen"] = + "Capture microphone audio with voice-activity detection and return when the user stops speaking, or after timeoutMs. Args: timeoutMs (int, optional, default 30000, range 1000..120000), language (string, optional BCP-47 tag or 'auto', default 'auto'). Returns { text, language, durationMs, segments[{ text, startMs, endMs }], engineEffective ('whisper') }. Result is the full silence-bounded utterance (all Whisper segments concatenated), not a partial first segment. Requires NodeSttEnabled.", + ["stt.status"] = + "Report STT engine readiness. No args. 
Returns { engine ('whisper'), readiness ('ready'|'initializing'|'model-downloading'|'model-not-downloaded'|'unavailable'), modelDownloadProgress (0..1 or null), isListenWithVadSupported (bool), isBoundedTranscribeSupported (bool) }. Carries no PII (no transcript history, no language history, no device IDs, no model paths).", + // tts.* ["tts.speak"] = - "Speak text aloud on the Windows node. Args: text (string, required), provider ('windows'|'elevenlabs', optional), voiceId (string, optional), model (string, optional), interrupt (bool, default false). Returns { spoken, provider, contentType, durationMs }.", + "Speak text aloud on the Windows node. Args: text (string, required), provider ('piper'|'windows'|'elevenlabs', optional — falls back to the configured TtsProvider setting, default 'piper' for fresh installs), voiceId (string, optional — overrides the per-provider configured voice), model (string, optional, ElevenLabs only), interrupt (bool, default false — interrupts any in-progress playback). 
Returns { spoken, provider, contentType, durationMs }.", // app.* ["app.navigate"] = diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs index 496e879a..3ce16f99 100644 --- a/src/OpenClaw.Shared/Models.cs +++ b/src/OpenClaw.Shared/Models.cs @@ -762,7 +762,7 @@ public static List BuildDefaultWindowsMatrix() { Name = "Microphone", Status = "review", - Detail = "Required only for camera clips with audio or future voice features.", + Detail = "Required for camera clips with audio and for stt.transcribe speech-to-text capture.", SettingsUri = "ms-settings:privacy-microphone" }, new() @@ -1019,7 +1019,7 @@ public static class CommandCenterCommandGroups public static readonly FrozenSet SafeCompanionCommandSet = SafeCompanionCommands.ToFrozenSet(StringComparer.OrdinalIgnoreCase); - public static readonly string[] DangerousCommands = + public static readonly string[] CommonDangerousCommands = [ "camera.snap", "camera.clip", @@ -1027,6 +1027,14 @@ public static class CommandCenterCommandGroups "tts.speak" ]; + public static readonly string[] DangerousCommands = + [ + .. CommonDangerousCommands, + "stt.transcribe", + "stt.listen", + "stt.status" + ]; + public static readonly FrozenSet DangerousCommandSet = DangerousCommands.ToFrozenSet(StringComparer.OrdinalIgnoreCase); @@ -1235,7 +1243,7 @@ public static List BuildNodeWarnings(NodeCapabilityHea Severity = GatewayDiagnosticSeverity.Info, Category = "allowlist", Title = "Privacy-sensitive commands are currently blocked", - Detail = $"{blocked} {(node.MissingDangerousAllowlistCommands.Count == 1 ? "is" : "are")} declared but filtered by gateway policy. Leave blocked unless you explicitly want camera or screen recording access for this node.", + Detail = $"{blocked} {(node.MissingDangerousAllowlistCommands.Count == 1 ? "is" : "are")} declared but filtered by gateway policy. 
Leave blocked unless you explicitly want camera, microphone, or screen recording access for this node.", RepairAction = "Copy opt-in guidance", CopyText = BuildDangerousCommandOptInGuidance(node.MissingDangerousAllowlistCommands) }); diff --git a/src/OpenClaw.Shared/OpenClaw.Shared.csproj b/src/OpenClaw.Shared/OpenClaw.Shared.csproj index ac832955..2c0a6df5 100644 --- a/src/OpenClaw.Shared/OpenClaw.Shared.csproj +++ b/src/OpenClaw.Shared/OpenClaw.Shared.csproj @@ -15,6 +15,13 @@ + + + + + + + diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs index f1983d88..a8096c0c 100644 --- a/src/OpenClaw.Shared/SettingsData.cs +++ b/src/OpenClaw.Shared/SettingsData.cs @@ -34,8 +34,24 @@ public class SettingsData public bool NodeCameraEnabled { get; set; } = true; public bool NodeLocationEnabled { get; set; } = true; public bool NodeBrowserProxyEnabled { get; set; } = true; + public bool NodeSttEnabled { get; set; } = false; + /// STT language: "auto" for Whisper auto-detect, or a BCP-47 tag like "en-US". + public string SttLanguage { get; set; } = "auto"; + /// Whisper model name: "tiny", "base", or "small". + public string SttModelName { get; set; } = "base"; + /// Seconds of silence before auto-submit in voice chat mode. + public float SttSilenceTimeout { get; set; } = 2.5f; + /// Enable TTS playback of responses during voice sessions. + public bool VoiceTtsEnabled { get; set; } = true; + /// Play audio feedback chimes on listen start/stop. + public bool VoiceAudioFeedback { get; set; } = true; public bool NodeTtsEnabled { get; set; } = false; - public string TtsProvider { get; set; } = "windows"; + public string TtsProvider { get; set; } = OpenClaw.Shared.Capabilities.TtsCapability.PiperProvider; + /// Persisted: whether the Hub's NavigationView pane is expanded + /// (true) or collapsed/compact (false). Default true. + public bool HubNavPaneOpen { get; set; } = true; + /// Optional Windows TTS voice id (or display name). 
Empty = system default. + public string? TtsWindowsVoiceId { get; set; } /// /// ElevenLabs API key storage slot. When persisted by the Windows tray's /// SettingsManager this is an opaque dpapi:-prefixed blob, not plaintext. @@ -43,6 +59,8 @@ public class SettingsData public string? TtsElevenLabsApiKey { get; set; } public string? TtsElevenLabsModel { get; set; } public string? TtsElevenLabsVoiceId { get; set; } + /// Piper voice identifier, e.g. "en_US-amy-low". Voice file is downloaded on first use. + public string TtsPiperVoiceId { get; set; } = "en_US-amy-low"; /// Run the local MCP HTTP server. Independent of EnableNodeMode. public bool EnableMcpServer { get; set; } = false; /// @@ -64,6 +82,8 @@ public class SettingsData public bool PreferStructuredCategories { get; set; } = true; public List? UserRules { get; set; } + // ── (Voice / STT settings consolidated into the block above.) ── + private static readonly JsonSerializerOptions s_options = new() { WriteIndented = true, diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs index 35930815..fc4f49af 100644 --- a/src/OpenClaw.Tray.WinUI/App.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs @@ -380,6 +380,7 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) { _globalHotkey = new GlobalHotkeyService(); _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed; + _globalHotkey.VoiceHotkeyPressed += OnVoiceHotkeyPressed; _globalHotkey.Register(); } @@ -454,6 +455,49 @@ private void ShowChatWindow() } } + private VoiceOverlayWindow? _voiceOverlayWindow; + private VoiceService? _standaloneVoiceService; + + private void ShowVoiceOverlay() + { + var voiceService = _nodeService?.VoiceService ?? 
EnsureStandaloneVoiceService(); + if (voiceService == null) + { + // STT not enabled — show settings + ShowHub("voice"); + return; + } + + if (_voiceOverlayWindow == null || _voiceOverlayWindow.AppWindow == null) + { + _voiceOverlayWindow = new VoiceOverlayWindow(voiceService, new AppLogger()); + _voiceOverlayWindow.Closed += (_, _) => _voiceOverlayWindow = null; + // Wire transcription to gateway chat when connected + _voiceOverlayWindow.TextSubmitted += text => + { + if (_gatewayClient != null && _currentStatus == ConnectionStatus.Connected) + { + _ = _gatewayClient.SendChatMessageAsync(text); + } + }; + // Wire Settings button → open the Hub on the Voice & Audio page. + _voiceOverlayWindow.SettingsRequested += () => + { + _dispatcherQueue?.TryEnqueue(() => ShowHub("voice")); + }; + } + + _voiceOverlayWindow.Activate(); + } + + private VoiceService? EnsureStandaloneVoiceService() + { + if (_settings?.NodeSttEnabled != true) + return null; + + return _standaloneVoiceService ??= new VoiceService(new AppLogger(), _settings); + } + private void OnTrayContextMenu(TrayIcon sender, TrayIconEventArgs e) { // Right-click: show menu @@ -503,6 +547,7 @@ private void OnTrayMenuItemClicked(object? 
sender, string action) case "dashboard": OpenDashboard(); break; case "canvas": _nodeService?.ShowCanvasWindow(); break; case "openchat": ShowChatWindow(); break; + case "voice": ShowVoiceOverlay(); break; case "webchat": ShowWebChat(); break; case "hub": ShowHub(); break; case "companion": @@ -1031,6 +1076,7 @@ private void BuildTrayMenuPopup(TrayMenuWindow menu) menu.AddMenuItem("Dashboard", "🌐", "dashboard"); menu.AddMenuItem("Chat", "💬", "openchat"); menu.AddMenuItem("Canvas", "🎨", "canvas"); + menu.AddMenuItem("Voice", "🎙️", "voice"); menu.AddMenuItem("Companion", "🦞", "companion"); menu.AddMenuItem(LocalizationHelper.GetString("Menu_QuickSend"), "📤", "quicksend"); @@ -1863,6 +1909,7 @@ private void SyncHubNodeState() _hubWindow.NodeIsPendingApproval = _nodeService.IsPendingApproval; _hubWindow.NodeShortDeviceId = _nodeService.ShortDeviceId; _hubWindow.NodeFullDeviceId = _nodeService.FullDeviceId; + _hubWindow.VoiceServiceInstance = _nodeService.VoiceService; } else { @@ -2318,6 +2365,32 @@ private void OnNotificationReceived(object? sender, OpenClawNotification notific $"{notification.Type ?? "info"}: {notification.Title ?? "notification"}", category: "notification", details: notification.Message); + + // Voice overlay: show agent chat responses, and (independently) speak them + // if the user enabled "Read responses aloud". TTS used to be gated on + // an active voice overlay session — we want the toggle to honor every + // chat reply now that voice and text chat will eventually share one UI. + if (notification.IsChat && !string.IsNullOrEmpty(notification.Message)) + { + if (_voiceOverlayWindow != null) + { + _dispatcherQueue?.TryEnqueue(() => + { + try + { + _voiceOverlayWindow?.AddAgentResponse(notification.Message); + } + catch { } + }); + } + + // TTS: read response aloud whenever the toggle is on (any chat surface). 
+ if (_settings?.VoiceTtsEnabled == true) + { + _ = SpeakResponseAsync(notification.Message); + } + } + if (_settings?.ShowNotifications != true) return; if (!ShouldShowNotification(notification)) return; @@ -2519,7 +2592,7 @@ private string BuildTrayTooltip() #region Window Management - private void ShowHub(string? navigateTo = null) + private void ShowHub(string? navigateTo = null, bool activate = true) { if (_hubWindow == null || _hubWindow.IsClosed) { @@ -2561,6 +2634,7 @@ private void ShowHub(string? navigateTo = null) _hubWindow.NodeShortDeviceId = _nodeService.ShortDeviceId; _hubWindow.NodeFullDeviceId = _nodeService.FullDeviceId; } + _hubWindow.VoiceServiceInstance = _nodeService?.VoiceService ?? _standaloneVoiceService; _hubWindow.SettingsSaved += OnSettingsSaved; _hubWindow.Closed += (s, e) => { @@ -2578,6 +2652,7 @@ private void ShowHub(string? navigateTo = null) _hubWindow.Settings = _settings; _hubWindow.GatewayClient = _gatewayClient; _hubWindow.CurrentStatus = _currentStatus; + _hubWindow.VoiceServiceInstance = _nodeService?.VoiceService ?? _standaloneVoiceService; if (_nodeService != null) { _hubWindow.NodeIsConnected = _nodeService.IsConnected; @@ -2594,7 +2669,29 @@ private void ShowHub(string? navigateTo = null) { _hubWindow.NavigateTo(navigateTo); } - _hubWindow.Activate(); + if (activate) + { + _hubWindow.Activate(); + } + else + { + // Show without stealing focus — used by right-click on the + // tray icon where the popup needs to remain the foreground + // window (popups light-dismiss if focus moves away). + // If the Hub was minimized, restore it first so it actually + // becomes visible behind the popup; otherwise Show(false) + // is a no-op on a minimized window. 
+ try + { + if (_hubWindow.AppWindow.Presenter is Microsoft.UI.Windowing.OverlappedPresenter op + && op.State == Microsoft.UI.Windowing.OverlappedPresenterState.Minimized) + { + op.Restore(activateWindow: false); + } + _hubWindow.AppWindow.Show(activateWindow: false); + } + catch { /* swallow */ } + } } private void SeedHubCachedData() @@ -3637,8 +3734,6 @@ private void CopyExtensibilitySummary() private void OnGlobalHotkeyPressed(object? sender, EventArgs e) { - // Hotkey events are raised from a dedicated Win32 message-loop thread. - // Creating/activating WinUI windows must happen on the app's UI thread. if (_dispatcherQueue == null) { Logger.Warn("Hotkey pressed but DispatcherQueue is null"); @@ -3652,6 +3747,12 @@ private void OnGlobalHotkeyPressed(object? sender, EventArgs e) } } + private void OnVoiceHotkeyPressed(object? sender, EventArgs e) + { + if (_dispatcherQueue == null) return; + _dispatcherQueue.TryEnqueue(() => ShowVoiceOverlay()); + } + #endregion #region Updates @@ -3871,6 +3972,8 @@ private void HandleDeepLink(string uri) OpenDashboard = OpenDashboard, OpenQuickSend = ShowQuickSend, OpenHub = (page) => ShowHub(page), + OpenVoice = () => ShowVoiceOverlay(), + StopVoice = () => _ = StopVoiceAsync(), SendMessage = async (msg) => { if (_gatewayClient != null) @@ -3881,6 +3984,58 @@ private void HandleDeepLink(string uri) }); } + private async Task StopVoiceAsync() + { + var voiceService = _nodeService?.VoiceService; + if (voiceService != null) + await voiceService.StopAsync(); + } + + private int _ttsMuteCount; + + private async Task SpeakResponseAsync(string text) + { + var voiceService = _nodeService?.VoiceService; + var ttsService = _nodeService?.TextToSpeech; + try + { + if (voiceService == null || _settings == null || ttsService == null) return; + + // Increment mute counter — multiple concurrent TTS won't unmute prematurely + Interlocked.Increment(ref _ttsMuteCount); + voiceService.IsMutedForPlayback = true; + + var speakText = text.Length > 
500 ? text[..500] + "..." : text; + + // Don't pass VoiceId here. The shared TextToSpeechService picks + // the right per-provider voice from settings (TtsPiperVoiceId, + // TtsWindowsVoiceId, TtsElevenLabsVoiceId). Cross-provider + // voice IDs would otherwise leak across providers. + var speakArgs = new OpenClaw.Shared.Capabilities.TtsSpeakArgs + { + Text = speakText, + Provider = _settings.TtsProvider ?? TtsCapability.PiperProvider, + Interrupt = true + }; + + await ttsService.SpeakAsync(speakArgs); + } + catch (Exception ex) + { + Logger.Warn($"TTS response playback failed: {ex.Message}"); + } + finally + { + // Only unmute when all concurrent TTS operations have finished + if (voiceService != null) + { + await Task.Delay(300); + if (Interlocked.Decrement(ref _ttsMuteCount) <= 0) + voiceService.IsMutedForPlayback = false; + } + } + } + private static void SendDeepLinkToRunningInstance(string uri) { try @@ -3988,6 +4143,12 @@ private void ExitApplication() _nodeService = null; }); + SafeShutdownStep("standalone voice service", () => + { + _standaloneVoiceService?.DisposeAsync().AsTask().GetAwaiter().GetResult(); + _standaloneVoiceService = null; + }); + SafeShutdownStep("ssh tunnel service", () => { _sshTunnelService?.Dispose(); diff --git a/src/OpenClaw.Tray.WinUI/Onboarding/Pages/WizardPage.cs b/src/OpenClaw.Tray.WinUI/Onboarding/Pages/WizardPage.cs index 687bd716..7355f1ec 100644 --- a/src/OpenClaw.Tray.WinUI/Onboarding/Pages/WizardPage.cs +++ b/src/OpenClaw.Tray.WinUI/Onboarding/Pages/WizardPage.cs @@ -282,7 +282,7 @@ async void SubmitStep() var answerValue = string.IsNullOrEmpty(stepInput) ? 
"true" : stepInput; // Smart timeout: 5min for auth-related steps (device code polling), 30s for everything else - var isAuthStep = !string.IsNullOrEmpty(stepMessage) && + var isAuthStep = !string.IsNullOrEmpty(stepMessage) && (stepMessage.Contains("device", StringComparison.OrdinalIgnoreCase) || stepMessage.Contains("authorize", StringComparison.OrdinalIgnoreCase) || stepMessage.Contains("login", StringComparison.OrdinalIgnoreCase) || @@ -639,4 +639,3 @@ async void SkipStep() .Padding(0, 8, 0, 0); } } - diff --git a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj index b9873035..ed8afcbd 100644 --- a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj +++ b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj @@ -60,6 +60,8 @@ + + diff --git a/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml b/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml index 2022f27b..bfff9a6f 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml +++ b/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml @@ -25,6 +25,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + settings.NodeScreenEnabled = v), ("📍", "Location", settings.NodeLocationEnabled, v => settings.NodeLocationEnabled = v), ("🔊", "Text-to-Speech", settings.NodeTtsEnabled, v => settings.NodeTtsEnabled = v), + ("🎤", "Speech-to-Text", settings.NodeSttEnabled, v => settings.NodeSttEnabled = v), }; var items = new List(); @@ -59,6 +71,8 @@ private void BuildCapabilityToggles(HubWindow hub) setter(toggle.IsOn); settings.Save(); hub.RaiseSettingsSaved(); + UpdateSttCard(hub); + UpdateTtsCard(hub); UpdateNodeStatus(hub); }; items.Add(toggle); @@ -67,6 +81,162 @@ private void BuildCapabilityToggles(HubWindow hub) CapabilityRepeater.ItemsSource = items; } + // ============================================================ + // Speech-to-Text settings card + // ============================================================ + + private void 
UpdateSttCard(HubWindow hub) + { + var enabled = hub.Settings?.NodeSttEnabled == true; + SttCard.Visibility = enabled ? Visibility.Visible : Visibility.Collapsed; + if (!enabled || hub.Settings == null) return; + + UpdateSttEngineHint(hub); + } + + private void UpdateSttEngineHint(HubWindow hub) + { + // Whisper is the only engine. Surface model-readiness so the user + // knows what (if anything) needs to happen before stt.* will work. + // + // Check the file directly via WhisperModelManager rather than going + // through hub.VoiceServiceInstance — that instance is only created + // by NodeService.RegisterCapabilities() at Connect time, so a user + // who toggled STT on but hasn't reconnected yet would see a stale + // "not downloaded" message even with the file on disk. + var modelName = hub.Settings?.SttModelName ?? "base"; + var modelManager = new OpenClaw.Shared.Audio.WhisperModelManager( + SettingsManager.SettingsDirectoryPath, new AppLogger()); + var modelDownloaded = modelManager.IsModelDownloaded(modelName); + var modelDownloading = hub.VoiceServiceInstance?.IsWhisperDownloadingModel ?? false; + + if (modelDownloaded) + { + SttEngineHint.Text = "Whisper model is ready. Speech-to-text runs fully on this PC; no audio leaves the device."; + } + else if (modelDownloading) + { + SttEngineHint.Text = "Whisper model is downloading. Speech-to-text will be available once it's ready."; + } + else + { + SttEngineHint.Text = "Whisper model is not downloaded. Open More voice settings… to download it before using speech-to-text."; + } + } + + private void OnSttMoreSettingsClick(object sender, RoutedEventArgs e) + { + // Navigate the Hub to the dedicated voice settings page. 
+ _hub?.NavigateTo("voice"); + } + + // ============================================================ + // Text-to-Speech settings card + // ============================================================ + + private void UpdateTtsCard(HubWindow hub) + { + var enabled = hub.Settings?.NodeTtsEnabled == true; + TtsCard.Visibility = enabled ? Visibility.Visible : Visibility.Collapsed; + if (!enabled || hub.Settings == null) return; + + var settings = hub.Settings; + + _suppressTtsProviderChange = true; + // ComboBox order: 0=Piper, 1=Windows, 2=ElevenLabs. + TtsProviderComboBox.SelectedIndex = settings.TtsProvider switch + { + var p when string.Equals(p, TtsCapability.ElevenLabsProvider, StringComparison.OrdinalIgnoreCase) => 2, + var p when string.Equals(p, TtsCapability.WindowsProvider, StringComparison.OrdinalIgnoreCase) => 1, + _ => 0 // default to Piper for unknown / null / whitespace + }; + _suppressTtsProviderChange = false; + + // PasswordBox shows a masked sentinel when we already have a saved + // key, so the user can tell something is set without us ever + // putting plaintext on screen. + TtsElevenLabsApiKeyBox.Password = + string.IsNullOrEmpty(settings.TtsElevenLabsApiKey) ? "" : SavedApiKeySentinel; + TtsElevenLabsVoiceIdBox.Text = settings.TtsElevenLabsVoiceId; + TtsElevenLabsModelBox.Text = settings.TtsElevenLabsModel; + + UpdateTtsElevenLabsPanelVisibility(); + TtsStatusText.Text = ""; + } + + private void UpdateTtsElevenLabsPanelVisibility() + { + var isEleven = (TtsProviderComboBox.SelectedItem is ComboBoxItem item) + && string.Equals(item.Tag as string, TtsCapability.ElevenLabsProvider, StringComparison.OrdinalIgnoreCase); + TtsElevenLabsPanel.Visibility = isEleven ? 
Visibility.Visible : Visibility.Collapsed; + } + + private void OnTtsProviderSelectionChanged(object sender, SelectionChangedEventArgs e) + { + if (_suppressTtsProviderChange) return; + if (_hub?.Settings == null) return; + + var newProvider = (TtsProviderComboBox.SelectedItem is ComboBoxItem item && item.Tag is string tag) + ? tag + : TtsCapability.WindowsProvider; + + if (!string.Equals(_hub.Settings.TtsProvider, newProvider, StringComparison.OrdinalIgnoreCase)) + { + _hub.Settings.TtsProvider = newProvider; + _hub.Settings.Save(); + _hub.RaiseSettingsSaved(); + TtsStatusText.Text = $"Default provider: {newProvider}"; + } + + UpdateTtsElevenLabsPanelVisibility(); + } + + private void OnTtsElevenLabsCommitted(object sender, RoutedEventArgs e) + { + if (_hub?.Settings == null) return; + var settings = _hub.Settings; + + var changed = false; + + // Treat the sentinel as "keep existing"; only overwrite when the + // user has typed a real key. + var typedKey = TtsElevenLabsApiKeyBox.Password ?? ""; + if (!string.Equals(typedKey, SavedApiKeySentinel, StringComparison.Ordinal)) + { + var trimmedKey = typedKey.Trim(); + if (!string.Equals(settings.TtsElevenLabsApiKey, trimmedKey, StringComparison.Ordinal)) + { + settings.TtsElevenLabsApiKey = trimmedKey; + changed = true; + } + } + + var voiceId = TtsElevenLabsVoiceIdBox.Text?.Trim() ?? ""; + if (!string.Equals(settings.TtsElevenLabsVoiceId, voiceId, StringComparison.Ordinal)) + { + settings.TtsElevenLabsVoiceId = voiceId; + changed = true; + } + + var model = TtsElevenLabsModelBox.Text?.Trim() ?? ""; + if (!string.Equals(settings.TtsElevenLabsModel, model, StringComparison.Ordinal)) + { + settings.TtsElevenLabsModel = model; + changed = true; + } + + if (changed) + { + settings.Save(); + _hub.RaiseSettingsSaved(); + // Re-render the API key field so the sentinel tracks the newly + // saved state instead of leaving the typed key visible. 
+ TtsElevenLabsApiKeyBox.Password = + string.IsNullOrEmpty(settings.TtsElevenLabsApiKey) ? "" : SavedApiKeySentinel; + TtsStatusText.Text = "ElevenLabs settings saved."; + } + } + private void UpdateNodeStatus(HubWindow hub) { var nodeEnabled = hub.Settings?.EnableNodeMode ?? false; @@ -90,6 +260,7 @@ private void UpdateNodeStatus(HubWindow hub) if (hub.Settings?.NodeScreenEnabled == true) caps.Add("screen"); if (hub.Settings?.NodeLocationEnabled == true) caps.Add("location"); if (hub.Settings?.NodeTtsEnabled == true) caps.Add("tts"); + if (hub.Settings?.NodeSttEnabled == true) caps.Add("stt"); NodeDetailsText.Text = caps.Count > 0 ? $"Providing {caps.Count} capabilities: {string.Join(", ", caps)}" : "No capabilities enabled."; diff --git a/src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml b/src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml index 3c4c5828..e3687d4b 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml +++ b/src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml @@ -19,14 +19,14 @@ - + - - string.Equals(s.Status, "active", StringComparison.OrdinalIgnoreCase)) ?? 
false; @@ -72,17 +72,17 @@ private void UpdateMoltyRing(ConnectionStatus status) if (status == ConnectionStatus.Connected && hasActiveSessions) { // Agent working — animated blue ring - MoltyRing.Visibility = Visibility.Collapsed; - MoltyProgressRing.IsActive = true; - MoltyProgressRing.Visibility = Visibility.Visible; + CompanionRing.Visibility = Visibility.Collapsed; + CompanionProgressRing.IsActive = true; + CompanionProgressRing.Visibility = Visibility.Visible; } else { - MoltyProgressRing.IsActive = false; - MoltyProgressRing.Visibility = Visibility.Collapsed; - MoltyRing.Visibility = Visibility.Visible; + CompanionProgressRing.IsActive = false; + CompanionProgressRing.Visibility = Visibility.Collapsed; + CompanionRing.Visibility = Visibility.Visible; - MoltyRing.Stroke = status switch + CompanionRing.Stroke = status switch { ConnectionStatus.Connected => new SolidColorBrush(Colors.LimeGreen), ConnectionStatus.Error => new SolidColorBrush(Colors.Red), @@ -133,7 +133,7 @@ public void UpdateSessions(SessionInfo[] sessions) _lastSessions = sessions; DispatcherQueue?.TryEnqueue(() => { - UpdateMoltyRing(_lastStatus); + UpdateCompanionRing(_lastStatus); UpdateStatusText(_lastStatus); }); } diff --git a/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml b/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml new file mode 100644 index 00000000..34c3bf20 --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml @@ -0,0 +1,187 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs new file mode 100644 index 00000000..e18feb78 --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs @@ -0,0 +1,345 @@ +using System; +using System.Globalization; +using 
System.Threading.Tasks; +using Microsoft.UI.Dispatching; +using Microsoft.UI.Xaml; +using Microsoft.UI.Xaml.Controls; +using Microsoft.UI.Xaml.Media; +using OpenClaw.Shared; +using OpenClaw.Shared.Audio; +using OpenClawTray.Helpers; +using OpenClawTray.Services; +using WinUIEx; + +namespace OpenClawTray.Windows; + +/// +/// Floating voice overlay window for voice chat sessions. +/// Shows conversation transcript, audio levels, and controls. +/// +public sealed partial class VoiceOverlayWindow : WindowEx +{ + private readonly VoiceService _voiceService; + private readonly IOpenClawLogger _logger; + private readonly DispatcherQueue _dispatcherQueue; + private bool _isMuted; + + /// Fired when the user submits transcribed text to the agent. + public event Action? TextSubmitted; + + /// Fired when the user clicks the Settings button. Hosts should + /// navigate to the Voice & Audio page (e.g. via ShowHub("voice")). + public event Action? SettingsRequested; + + public VoiceOverlayWindow(VoiceService voiceService, IOpenClawLogger logger) + { + InitializeComponent(); + _voiceService = voiceService; + _logger = logger; + _dispatcherQueue = DispatcherQueue.GetForCurrentThread(); + + // Modern custom title bar + ExtendsContentIntoTitleBar = true; + SetTitleBar(AppTitleBar); + + _voiceService.TranscriptionReceived += OnTranscriptionReceived; + _voiceService.UtteranceCompleted += OnUtteranceCompleted; + _voiceService.SpeakingChanged += OnSpeakingChanged; + _voiceService.AudioLevelChanged += OnAudioLevelChanged; + _voiceService.ModeChanged += OnModeChanged; + _voiceService.PipelineStateChanged += OnPipelineStateChanged; + _voiceService.DiagnosticMessage += OnDiagnosticMessage; + + Closed += WindowClosed; + UpdateUI(); + } + + private DateTime _lastUserBubbleTime = DateTime.MinValue; + private TextBlock? _lastUserTextBlock; + + private void OnTranscriptionReceived(string text) + { + _dispatcherQueue.TryEnqueue(() => + { + // Per-segment bubble update (visual streaming). 
Consolidate into + // the last user bubble when fragments arrive within 5 seconds so + // a multi-segment utterance reads as one bubble in the transcript. + var elapsed = DateTime.UtcNow - _lastUserBubbleTime; + if (_lastUserTextBlock != null && elapsed.TotalSeconds < 5) + { + _lastUserTextBlock.Text += " " + text; + _lastUserBubbleTime = DateTime.UtcNow; + try + { + TranscriptScroller.UpdateLayout(); + TranscriptScroller.ChangeView(null, TranscriptScroller.ScrollableHeight, null); + } + catch { } + } + else + { + AddTranscriptBubble(text, isUser: true); + } + // NOTE: chat submission moved to OnUtteranceCompleted so the + // gateway receives one message per spoken utterance, not one per + // Whisper segment. + }); + } + + private void OnUtteranceCompleted(OpenClaw.Shared.Audio.UtteranceResult utterance) + { + // Fire once per silence-bounded utterance. The visual bubble already + // shows the streamed text; here we just hand the complete sentence + // to the gateway exactly once. + _dispatcherQueue.TryEnqueue(() => + { + if (!string.IsNullOrWhiteSpace(utterance.Text)) + TextSubmitted?.Invoke(utterance.Text); + }); + } + + /// Add an agent response to the transcript. + public void AddAgentResponse(string text) + { + _dispatcherQueue.TryEnqueue(() => + { + AddTranscriptBubble(text, isUser: false); + }); + } + + private void AddTranscriptBubble(string text, bool isUser) + { + try + { + // Hide empty state on first message + if (EmptyState.Visibility == Visibility.Visible) + EmptyState.Visibility = Visibility.Collapsed; + + var bubble = new Border + { + Background = isUser + ? new SolidColorBrush(Microsoft.UI.Colors.DodgerBlue) + : (Brush)Application.Current.Resources["CardBackgroundFillColorDefaultBrush"], + CornerRadius = isUser + ? new CornerRadius(12, 12, 4, 12) + : new CornerRadius(12, 12, 12, 4), + Padding = new Thickness(12, 10, 12, 10), + HorizontalAlignment = isUser + ? HorizontalAlignment.Right + : HorizontalAlignment.Left, + Margin = new Thickness(isUser ? 
24 : 0, 4, isUser ? 0 : 24, 4) + }; + + var icon = isUser ? "\uE77B" : "\uE799"; // Person / Robot + var grid = new Grid { ColumnSpacing = 8 }; + grid.ColumnDefinitions.Add(new ColumnDefinition { Width = GridLength.Auto }); + grid.ColumnDefinitions.Add(new ColumnDefinition { Width = new GridLength(1, GridUnitType.Star) }); + + var fontIcon = new FontIcon { Glyph = icon, FontSize = 12, VerticalAlignment = VerticalAlignment.Top, Margin = new Thickness(0, 3, 0, 0) }; + Grid.SetColumn(fontIcon, 0); + grid.Children.Add(fontIcon); + + var textBlock = new TextBlock + { + Text = text, + TextWrapping = TextWrapping.Wrap, + FontSize = 13, + IsTextSelectionEnabled = true + }; + if (isUser) + { + textBlock.Foreground = new SolidColorBrush(Microsoft.UI.Colors.White); + _lastUserTextBlock = textBlock; + _lastUserBubbleTime = DateTime.UtcNow; + } + else + { + // Agent response breaks the consolidation window + _lastUserTextBlock = null; + } + Grid.SetColumn(textBlock, 1); + grid.Children.Add(textBlock); + + bubble.Child = grid; + TranscriptPanel.Children.Add(bubble); + + // Auto-scroll to bottom + TranscriptScroller.UpdateLayout(); + TranscriptScroller.ChangeView(null, TranscriptScroller.ScrollableHeight, null); + } + catch (Exception ex) + { + _logger.Error("Failed to add transcript bubble", ex); + } + } + + private static string L(string key) => LocalizationHelper.GetString(key); + private static string Lf(string key, params object?[] args) => + string.Format(CultureInfo.CurrentCulture, LocalizationHelper.GetString(key), args); + + private void OnSpeakingChanged(bool isSpeaking) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusText.Text = isSpeaking + ? L("VoiceOverlayWindow_StatusListening") + : L("VoiceOverlayWindow_StatusSpeakNow"); + }); + } + + private void OnAudioLevelChanged(float level) + { + _dispatcherQueue.TryEnqueue(() => + { + // Scale the level bar width (max width = parent width) + var maxWidth = AudioLevelBar.Parent is FrameworkElement parent ? 
parent.ActualWidth : 300; + AudioLevelBar.Width = Math.Max(0, level * maxWidth); + }); + } + + private void OnModeChanged(VoiceMode mode) + { + _dispatcherQueue.TryEnqueue(UpdateUI); + } + + private void OnDiagnosticMessage(string message) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusText.Text = message; + }); + } + + private void OnPipelineStateChanged(AudioPipelineState state) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusBadge.Text = state switch + { + AudioPipelineState.Stopped => L("VoiceOverlayWindow_BadgeStopped"), + AudioPipelineState.Starting => L("VoiceOverlayWindow_BadgeStartingDots"), + AudioPipelineState.Listening => L("VoiceOverlayWindow_BadgeListening"), + AudioPipelineState.Processing => L("VoiceOverlayWindow_BadgeProcessing"), + AudioPipelineState.Error => L("VoiceOverlayWindow_StateError"), + _ => L("VoiceOverlayWindow_BadgeUnknown") + }; + + StatusText.Text = state switch + { + AudioPipelineState.Stopped => L("VoiceOverlayWindow_StatusReadyMessage"), + AudioPipelineState.Starting => L("VoiceOverlayWindow_StatusInitMic"), + AudioPipelineState.Listening => L("VoiceOverlayWindow_StatusSpeakNow"), + AudioPipelineState.Processing => L("VoiceOverlayWindow_StatusTranscribing"), + AudioPipelineState.Error => L("VoiceOverlayWindow_StatusErrorOccurred"), + _ => "" + }; + }); + } + + private void UpdateUI() + { + var isActive = _voiceService.CurrentMode != VoiceMode.Inactive; + + StartStopIcon.Glyph = isActive ? "\uE71A" : "\uE768"; // Stop / Play + StartStopText.Text = isActive + ? 
L("VoiceOverlayWindow_StopText") + : L("VoiceOverlayWindow_ButtonStartListening"); + MuteButton.IsEnabled = isActive; + + if (!isActive) + { + StatusBadge.Text = L("VoiceOverlayWindow_BadgeReady"); + StatusText.Text = L("VoiceOverlayWindow_StatusReadyMessage"); + AudioLevelBar.Width = 0; + } + } + + private async void OnStartStopClick(object sender, RoutedEventArgs e) + { + try + { + if (_voiceService.CurrentMode == VoiceMode.Inactive) + { + StatusText.Text = L("VoiceOverlayWindow_StateInitializing"); + StatusBadge.Text = L("VoiceOverlayWindow_StateStarting"); + StartStopButton.IsEnabled = false; + + // Initialize models if needed (may trigger downloads) + if (!_voiceService.IsModelLoaded) + { + if (!_voiceService.IsModelDownloaded) + { + StatusText.Text = L("VoiceOverlayWindow_StateDownloadingModel"); + var progress = new Progress<(long downloaded, long total)>(p => + { + _dispatcherQueue.TryEnqueue(() => + { + var pct = p.total > 0 ? (int)(p.downloaded * 100 / p.total) : 0; + StatusText.Text = Lf("VoiceOverlayWindow_StateDownloadingPct", pct); + }); + }); + await _voiceService.DownloadModelAsync(progress: progress); + } + + StatusText.Text = L("VoiceOverlayWindow_StateLoadingModel"); + await _voiceService.InitializeAsync(); + } + + StatusText.Text = L("VoiceOverlayWindow_StateStartingMic"); + await _voiceService.StartVoiceChatAsync(); + } + else + { + StatusText.Text = L("VoiceOverlayWindow_StateStopping"); + await _voiceService.StopAsync(); + } + } + catch (Exception ex) + { + _logger.Error("Voice overlay start/stop failed", ex); + // Sanitized — full ex.Message is in the log. + StatusText.Text = L("VoiceOverlayWindow_StatusError"); + StatusBadge.Text = L("VoiceOverlayWindow_StateError"); + } + finally + { + StartStopButton.IsEnabled = true; + UpdateUI(); + } + } + + private async void OnMuteClick(object sender, RoutedEventArgs e) + { + _isMuted = !_isMuted; + MuteIcon.Glyph = _isMuted ? 
"\uE74F" : "\uE767"; // Muted / Volume + + if (_isMuted) + { + await _voiceService.StopAsync(); + StatusText.Text = L("VoiceOverlayWindow_StatusMuted"); + } + else + { + await _voiceService.StartVoiceChatAsync(); + } + } + + private void OnSettingsClick(object sender, RoutedEventArgs e) + { + SettingsRequested?.Invoke(); + } + + private void WindowClosed(object sender, WindowEventArgs args) + { + _voiceService.TranscriptionReceived -= OnTranscriptionReceived; + _voiceService.UtteranceCompleted -= OnUtteranceCompleted; + _voiceService.SpeakingChanged -= OnSpeakingChanged; + _voiceService.AudioLevelChanged -= OnAudioLevelChanged; + _voiceService.ModeChanged -= OnModeChanged; + _voiceService.PipelineStateChanged -= OnPipelineStateChanged; + _voiceService.DiagnosticMessage -= OnDiagnosticMessage; + + // Stop voice session when window closes + _ = _voiceService.StopAsync(); + } +} diff --git a/src/OpenClaw.WinNode.Cli/skill.md b/src/OpenClaw.WinNode.Cli/skill.md index acb5add9..d7373fa9 100644 --- a/src/OpenClaw.WinNode.Cli/skill.md +++ b/src/OpenClaw.WinNode.Cli/skill.md @@ -221,6 +221,124 @@ default camera. ``` Returns `{ format, durationMs, base64 }`. +## Speech-to-text (stt.*) + +Local Whisper.net runs on this device — no audio leaves the box. The +model is downloaded on first use; until then every `stt.*` call returns +a clear error pointing the caller at the Voice Settings page. +**Privacy-sensitive: requires `NodeSttEnabled` in tray Settings.** + +### stt.transcribe +Bounded fixed-duration mic capture + transcription. +``` +{ + "maxDurationMs": 5000, // required, > 0, max 30000 + "language": "en" // optional BCP-47 tag or "auto" — falls back to SttLanguage setting +} +``` +Returns `{ transcribed, text, durationMs, language, engineEffective: "whisper" }`. + +### stt.listen +Mic capture with voice-activity detection. Returns when the user stops +speaking or after `timeoutMs`. 
Result is the full silence-bounded +utterance (all Whisper segments concatenated), not a partial first +segment. +``` +{ + "timeoutMs": 30000, // optional, default 30000, range 1000..120000 + "language": "auto" // optional BCP-47 tag or "auto" +} +``` +Returns `{ text, language, durationMs, segments[{ text, startMs, endMs }], engineEffective: "whisper" }`. + +### stt.status +Engine readiness. No params. Carries no PII (no transcript history, +no language history, no device IDs, no model paths). +Returns `{ engine: "whisper", readiness, modelDownloadProgress, isListenWithVadSupported, isBoundedTranscribeSupported }` +where `readiness` ∈ `"ready" | "initializing" | "model-downloading" | "model-not-downloaded" | "unavailable"`. + +## Text-to-speech (tts.*) + +Three providers — Piper (local neural via Sherpa-ONNX, default), Windows +built-in speech, and ElevenLabs (cloud). Provider + per-provider voice +are configured in tray Settings. + +### tts.speak +Speak text aloud on the Windows node. +``` +{ + "text": "string", // required + "provider": "piper|windows|elevenlabs", // optional, falls back to TtsProvider setting + "voiceId": "string", // optional, overrides the per-provider configured voice + "model": "string", // optional, ElevenLabs only + "interrupt": false // default false; true cuts off any in-progress playback +} +``` +Returns `{ spoken, provider, contentType, durationMs }`. + +## App control (app.*) + +Read-only and small write operations targeting the running tray. Used +by the command palette and by automation that wants to drive the UI. + +### app.navigate +Navigate the companion app to a specific page. +``` +{"page": "home|sessions|settings|chat|voice|connection|capabilities|conversations|...""} +``` +Returns `{ navigated, page }`. + +### app.status +Current connection / node state. +No params. Returns `{ connectionStatus, nodeConnected, nodePaired, nodePendingApproval, gatewayVersion, sessionCount, nodeCount }`. 
+ +### app.sessions +Active sessions, optionally filtered by agent. +``` +{"agentId": "string"} // optional +``` +Returns array of `{ Key, Status, Model, AgeText, tokens }`. + +### app.agents +List agents from the connected gateway. No params. Returns the raw +agents JSON array. + +### app.nodes +List connected nodes and their capabilities. No params. Returns array +of `{ DisplayName, NodeId, IsOnline, Platform, CapabilityCount }`. + +### app.config.get +Read gateway configuration value at a dot-path. +``` +{"path": "string"} // optional; omit to fetch the full config tree +``` +Returns the config subtree (or full config) as JSON. + +### app.settings.get +Read a local app setting by name. +``` +{"name": "string"} // required +``` +Returns the setting value (type depends on the setting). + +### app.settings.set +Set a local app setting. +``` +{"name": "string", "value": "string"} // both required +``` +Returns `{ name, value }`. + +### app.menu +Get tray menu state (status, session count, node count). No params. +Returns array of menu items. + +### app.search +Search the command palette and return matching commands. +``` +{"query": "string"} // required +``` +Returns array of `{ Title, Subtitle, Icon }`. + --- ## A2UI v0.8 grammar (for canvas.a2ui.push) diff --git a/tests/OpenClaw.Shared.Tests/AssetHashPinningTests.cs b/tests/OpenClaw.Shared.Tests/AssetHashPinningTests.cs new file mode 100644 index 00000000..5073353d --- /dev/null +++ b/tests/OpenClaw.Shared.Tests/AssetHashPinningTests.cs @@ -0,0 +1,70 @@ +using System.Text.RegularExpressions; +using OpenClaw.Shared.Audio; +using Xunit; + +namespace OpenClaw.Shared.Tests; + +/// +/// Pre-GA security guard. Every shipped Whisper model and Piper voice MUST +/// have a pinned SHA-256 hash so the runtime can refuse tampered downloads. +/// New entries that forget the hash will fail this test loudly instead of +/// quietly being installable from a compromised source. 
+/// +/// See WhisperModelManager.AvailableModels / PiperVoiceManager.AvailableVoices +/// and Audio_FollowUps.md §2. +/// +public class AssetHashPinningTests +{ + private static readonly Regex Sha256Hex = new("^[0-9a-f]{64}$", RegexOptions.Compiled); + + [Fact] + public void EveryWhisperModel_HasPinnedSha256() + { + Assert.NotEmpty(WhisperModelManager.AvailableModels); + foreach (var m in WhisperModelManager.AvailableModels) + { + Assert.False(string.IsNullOrWhiteSpace(m.Sha256), + $"Whisper model '{m.Name}' is missing a pinned SHA-256 hash. Add one to AvailableModels."); + Assert.Matches(Sha256Hex, m.Sha256!); + } + } + + [Fact] + public void EveryPiperVoice_HasPinnedSha256() + { + Assert.NotEmpty(PiperVoiceManager.AvailableVoices); + foreach (var v in PiperVoiceManager.AvailableVoices) + { + Assert.False(string.IsNullOrWhiteSpace(v.Sha256), + $"Piper voice '{v.VoiceId}' is missing a pinned SHA-256 hash. Add one to AvailableVoices."); + Assert.Matches(Sha256Hex, v.Sha256!); + } + } + + [Fact] + public void EveryWhisperModel_UsesHttpsDownloadUrl() + { + foreach (var m in WhisperModelManager.AvailableModels) + { + Assert.StartsWith("https://", m.DownloadUrl); + } + } + + [Fact] + public void EveryPiperVoice_UsesHttpsDownloadUrl() + { + foreach (var v in PiperVoiceManager.AvailableVoices) + { + Assert.StartsWith("https://", v.DownloadUrl); + } + } + + [Fact] + public void SileroVadModel_HasPinnedSha256() + { + Assert.False(string.IsNullOrWhiteSpace(SileroVadModelManifest.Sha256), + "Silero VAD model is missing a pinned SHA-256 hash. 
Add one to SileroVadModelManifest."); + Assert.Matches(Sha256Hex, SileroVadModelManifest.Sha256); + Assert.StartsWith("https://", SileroVadModelManifest.DownloadUrl); + } +} diff --git a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs index 0cda6237..f79c25cd 100644 --- a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs +++ b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs @@ -2583,8 +2583,8 @@ public void CanHandle_TtsSpeak() [InlineData(" ELEVENLABS ", "windows", "elevenlabs")] [InlineData(null, "elevenlabs", "elevenlabs")] [InlineData(" ", "elevenlabs", "elevenlabs")] - [InlineData(null, "", "windows")] - [InlineData(null, " ", "windows")] + [InlineData(null, "", "piper")] + [InlineData(null, " ", "piper")] public void ResolveProvider_NormalizesRequestedAndConfiguredValues( string? requestedProvider, string? configuredProvider, @@ -2712,7 +2712,32 @@ public async Task Speak_ReturnsError_WhenHandlerThrows() }); Assert.False(res.Ok); - Assert.Contains("Audio device unavailable", res.Error); + // Privacy: response surfaces a fixed sanitized error; the underlying + // exception text (which can include device names, ElevenLabs key + // fragments from 401 messages, etc.) stays in the local log only. + Assert.Equal("Speak failed", res.Error); + } + + [Fact] + public async Task Speak_HandlerException_DoesNotLeakExceptionMessageIntoError() + { + // Privacy regression: a 401 from ElevenLabs containing a key prefix + // must not bleed into the response error path (and from there into + // recent activity / support bundles). 
+ var cap = new TtsCapability(NullLogger.Instance); + const string sensitive = "ElevenLabs 401: invalid key sk-secret-prefix-do-not-leak"; + cap.SpeakRequested += (_, _) => throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "tts-priv", + Command = "tts.speak", + Args = Parse("""{"text":"hello"}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + Assert.DoesNotContain("sk-secret-prefix-do-not-leak", res.Error); } [Fact] @@ -2901,3 +2926,567 @@ public async Task ExecuteAsync_ReturnsError_ForUnknownCommand() Assert.Contains("Unknown command", res.Error); } } + +public class SttCapabilityTests +{ + private static JsonElement Parse(string json) + { + using var doc = JsonDocument.Parse(json); + return doc.RootElement.Clone(); + } + + [Fact] + public void CanHandle_SttTranscribe() + { + var cap = new SttCapability(NullLogger.Instance); + Assert.True(cap.CanHandle("stt.transcribe")); + Assert.True(cap.CanHandle("stt.listen")); + Assert.True(cap.CanHandle("stt.status")); + Assert.False(cap.CanHandle("stt.stream")); + Assert.False(cap.CanHandle("tts.speak")); + Assert.Equal("stt", cap.Category); + Assert.Contains(SttCapability.TranscribeCommand, cap.Commands); + Assert.Contains(SttCapability.ListenCommand, cap.Commands); + Assert.Contains(SttCapability.StatusCommand, cap.Commands); + } + + [Fact] + public void ResolveLanguage_PrefersRequested() + { + Assert.Equal("ja-JP", SttCapability.ResolveLanguage("ja-JP", "en-GB")); + Assert.Equal("en-GB", SttCapability.ResolveLanguage(null, "en-GB")); + Assert.Equal("en-GB", SttCapability.ResolveLanguage(" ", "en-GB")); + Assert.Equal(SttCapability.DefaultLanguage, SttCapability.ResolveLanguage(null, null)); + } + + [Fact] + public void ResolveLanguage_RejectsNonsense() + { + Assert.Null(SttCapability.ResolveLanguage("not a tag", null)); + Assert.Null(SttCapability.ResolveLanguage("english", null)); + 
Assert.Null(SttCapability.ResolveLanguage("en_US", null)); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenMaxDurationMissing() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += (_, _) => throw new InvalidOperationException("should not be called"); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt1", + Command = "stt.transcribe", + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Missing required maxDurationMs", res.Error); + } + + [Theory] + [InlineData(0)] + [InlineData(-1)] + [InlineData(-5000)] + public async Task Transcribe_ReturnsError_WhenMaxDurationNotPositive(int maxMs) + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt2", + Command = "stt.transcribe", + Args = Parse($$"""{"maxDurationMs":{{maxMs}}}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Missing required maxDurationMs", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenMaxDurationExceedsBound() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt3", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":60000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("exceeds 30000", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenLanguageInvalid() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt4", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000,"language":"english please"}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Invalid language tag", res.Error); + } + + [Fact] + public async Task Transcribe_InvalidLanguageError_DoesNotEchoCallerInput() + { + // Privacy regression: caller-supplied language must not be echoed back + // in the error string, since failed-invoke errors 
land in recent + // activity / support bundles. + var cap = new SttCapability(NullLogger.Instance); + const string secretish = "ZZ-secret-tag-do-not-leak"; + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt-priv-lang", + Command = "stt.transcribe", + Args = Parse($$"""{"maxDurationMs":5000,"language":"{{secretish}}"}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(secretish, res.Error); + } + + [Fact] + public async Task Transcribe_HandlerException_DoesNotLeakExceptionMessageIntoError() + { + // Privacy regression: raw handler exception text could surface mic / + // audio-stack details. Response error must be a fixed sanitized + // string; full detail stays in logs. + var cap = new SttCapability(NullLogger.Instance); + const string sensitive = "secret-mic-device-path-or-stack-trace"; + cap.TranscribeRequested += (_, _) => throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt-priv-ex", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenHandlerNotWired() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt5", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error); + } + + [Fact] + public async Task Transcribe_PassesArgsToHandler_AndReturnsPayload() + { + var cap = new SttCapability(NullLogger.Instance); + SttTranscribeArgs? received = null; + cap.TranscribeRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttTranscribeResult + { + Transcribed = true, + Text = "hello", + DurationMs = 4200, + Language = a.Language ?? 
SttCapability.DefaultLanguage + }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt6", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000,"language":"en-GB"}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal(5000, received!.MaxDurationMs); + Assert.Equal("en-GB", received.Language); + + var payload = JsonSerializer.SerializeToElement(res.Payload); + Assert.True(payload.GetProperty("transcribed").GetBoolean()); + Assert.Equal("hello", payload.GetProperty("text").GetString()); + Assert.Equal(4200, payload.GetProperty("durationMs").GetInt32()); + Assert.Equal("en-GB", payload.GetProperty("language").GetString()); + } + + [Fact] + public async Task Transcribe_DropsLanguage_WhenOmitted_LettingTrayUseSetting() + { + var cap = new SttCapability(NullLogger.Instance); + SttTranscribeArgs? received = null; + cap.TranscribeRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttTranscribeResult { Transcribed = true, Text = "hi", DurationMs = 100, Language = "en-US" }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt7", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":1000}""") + }); + + Assert.True(res.Ok); + Assert.Null(received!.Language); + } + + [Fact] + public async Task Transcribe_ReportsHandlerException() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += (_, _) => throw new InvalidOperationException("Microphone unavailable."); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt8", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":2000}""") + }); + + Assert.False(res.Ok); + // Privacy: response surfaces a fixed sanitized error; raw exception + // text stays in the local log only. See + // Transcribe_HandlerException_DoesNotLeakExceptionMessageIntoError. 
+ Assert.Equal("Transcribe failed", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsCanceled_WhenTokenFires() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += async (_, ct) => + { + await Task.Delay(Timeout.Infinite, ct); + return new SttTranscribeResult(); + }; + + using var cts = new CancellationTokenSource(); + cts.CancelAfter(TimeSpan.FromMilliseconds(50)); + + var res = await cap.ExecuteAsync( + new NodeInvokeRequest { Id = "stt9", Command = "stt.transcribe", Args = Parse("""{"maxDurationMs":5000}""") }, + cts.Token); + + Assert.False(res.Ok); + Assert.Contains("canceled", res.Error, StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task ExecuteAsync_ReturnsError_ForUnknownCommand() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt10", + Command = "stt.stream", + Args = Parse("""{}""") + }); + Assert.False(res.Ok); + Assert.Contains("Unknown command", res.Error); + } + + // ============================================================ + // stt.listen (VAD-driven capture) + // ============================================================ + + [Fact] + public async Task Listen_ClampsTimeoutMs_BelowMin() + { + var cap = new SttCapability(NullLogger.Instance); + SttListenArgs? received = null; + cap.ListenRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttListenResult { Text = "x", Language = "auto", DurationMs = 100 }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-min", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":50}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal(SttCapability.MinListenTimeoutMs, received!.TimeoutMs); + } + + [Fact] + public async Task Listen_ClampsTimeoutMs_AboveMax() + { + var cap = new SttCapability(NullLogger.Instance); + SttListenArgs? 
received = null; + cap.ListenRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttListenResult { Text = "x", Language = "auto", DurationMs = 100 }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-max", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":1000000}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal(SttCapability.MaxListenTimeoutMs, received!.TimeoutMs); + } + + [Fact] + public async Task Listen_DefaultsLanguageToAuto() + { + var cap = new SttCapability(NullLogger.Instance); + SttListenArgs? received = null; + cap.ListenRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttListenResult { Text = "ok", Language = a.Language, DurationMs = 100 }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-auto", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000}""") + }); + + Assert.True(res.Ok); + Assert.Equal(SttCapability.AutoLanguage, received!.Language); + } + + [Fact] + public async Task Listen_ReturnsError_WhenLanguageInvalid() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-bad-lang", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000,"language":"english please"}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Invalid language tag", res.Error); + } + + [Fact] + public async Task Listen_InvalidLanguageError_DoesNotEchoCallerInput() + { + var cap = new SttCapability(NullLogger.Instance); + const string secretish = "ZZ-secret-tag-do-not-leak"; + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-priv-lang", + Command = "stt.listen", + Args = Parse($$"""{"timeoutMs":5000,"language":"{{secretish}}"}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(secretish, res.Error); + } + + [Fact] + public async Task Listen_ReturnsError_WhenHandlerNotWired() + { + var cap = new 
SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-no-handler", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error); + } + + [Fact] + public async Task Listen_HandlerException_DoesNotLeakExceptionMessageIntoError() + { + var cap = new SttCapability(NullLogger.Instance); + const string sensitive = "secret-mic-device-path-or-stack-trace"; + cap.ListenRequested += (_, _) => throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-priv-ex", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + Assert.Equal("Listen failed", res.Error); + } + + [Fact] + public async Task Listen_PassesSegmentsAndEngineMetadata() + { + var cap = new SttCapability(NullLogger.Instance); + cap.ListenRequested += (_, _) => Task.FromResult(new SttListenResult + { + Text = "hello world", + Language = "en-US", + DurationMs = 1500, + Segments = new[] + { + new SttSegment { Text = "hello", StartMs = 0, EndMs = 500 }, + new SttSegment { Text = "world", StartMs = 600, EndMs = 1500 }, + }, + EngineEffective = SttCapability.EngineWhisper + }); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-payload", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000,"language":"en-US"}""") + }); + + Assert.True(res.Ok); + // Round-trip through serialization to make sure the response object + // exposes the new fields. 
+ var json = System.Text.Json.JsonSerializer.Serialize(res.Payload); + Assert.Contains("\"text\":\"hello world\"", json); + Assert.Contains("\"engineEffective\":\"whisper\"", json); + Assert.Contains("\"segments\":", json); + } + + [Fact] + public async Task Listen_ReturnsCanceled_WhenTokenFires() + { + var cap = new SttCapability(NullLogger.Instance); + cap.ListenRequested += async (_, ct) => + { + await Task.Delay(Timeout.Infinite, ct); + return new SttListenResult(); + }; + + using var cts = new CancellationTokenSource(); + cts.CancelAfter(TimeSpan.FromMilliseconds(50)); + + var res = await cap.ExecuteAsync( + new NodeInvokeRequest { Id = "listen-cancel", Command = "stt.listen", Args = Parse("""{"timeoutMs":5000}""") }, + cts.Token); + + Assert.False(res.Ok); + Assert.Contains("canceled", res.Error, StringComparison.OrdinalIgnoreCase); + } + + // ============================================================ + // stt.status + // ============================================================ + + [Fact] + public async Task Status_ReturnsError_WhenHandlerNotWired() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "status-no-handler", + Command = "stt.status", + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error); + } + + [Fact] + public async Task Status_HandlerException_DoesNotLeakExceptionMessageIntoError() + { + var cap = new SttCapability(NullLogger.Instance); + const string sensitive = "secret-engine-stack-trace"; + cap.StatusRequested += _ => throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "status-priv-ex", + Command = "stt.status", + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + Assert.Equal("Status failed", res.Error); + } + + [Fact] + public async Task Status_ReturnsEngineReadiness() + { + var cap = new 
SttCapability(NullLogger.Instance); + cap.StatusRequested += _ => Task.FromResult(new SttStatusResult + { + Engine = SttCapability.EngineWhisper, + Readiness = "model-downloading", + ModelDownloadProgress = 0.42, + IsListenWithVadSupported = false, + IsBoundedTranscribeSupported = false, + }); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "status-ok", + Command = "stt.status", + Args = Parse("""{}""") + }); + + Assert.True(res.Ok); + var json = System.Text.Json.JsonSerializer.Serialize(res.Payload); + Assert.Contains("\"engine\":\"whisper\"", json); + Assert.Contains("\"readiness\":\"model-downloading\"", json); + Assert.Contains("\"modelDownloadProgress\":0.42", json); + // No PII fields ever surface in stt.status — even when synthesizing + // a result, callers can only see flat readiness strings + a single + // engine identifier. + Assert.DoesNotContain("language", json, StringComparison.OrdinalIgnoreCase); + Assert.DoesNotContain("path", json, StringComparison.OrdinalIgnoreCase); + } + + // ============================================================ + // BCP-47 + "auto" sentinel + // ============================================================ + + [Theory] + [InlineData("en-US", "en-US")] + [InlineData("en-GB", "en-GB")] + [InlineData("ja-JP", "ja-JP")] + [InlineData("zh-Hans-CN", "zh-Hans-CN")] + [InlineData(" en-US ", "en-US")] // leading/trailing whitespace trimmed + [InlineData("auto", "auto")] + [InlineData("AUTO", "auto")] // case-insensitive sentinel, normalized to lowercase + [InlineData("Auto", "auto")] + public void NormalizeLanguageTag_AcceptsValid(string input, string expected) + { + Assert.Equal(expected, SttCapability.NormalizeLanguageTag(input)); + } + + [Theory] + [InlineData("english")] + [InlineData("en_US")] // underscore not allowed + [InlineData("not a tag")] + [InlineData("en US")] // space not allowed + [InlineData("automatic")] // not the sentinel + public void NormalizeLanguageTag_RejectsInvalid(string input) + { 
+ Assert.Null(SttCapability.NormalizeLanguageTag(input)); + } +} diff --git a/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs b/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs index 043b8212..c25614c4 100644 --- a/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs +++ b/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs @@ -352,6 +352,118 @@ public async Task UnhandledException_ReturnsGenericInternalError_NotLeakingMessa Assert.DoesNotContain("secret-internal-detail", error.GetProperty("message").GetString()); } + [Fact] + public async Task ToolsList_SttTranscribe_HasCuratedDescription() + { + var caps = new List + { + new FakeCapability("stt", "stt.transcribe"), + }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var description = doc.RootElement.GetProperty("result") + .GetProperty("tools")[0] + .GetProperty("description") + .GetString()!; + + // Must mention the key surface area so MCP clients render something useful. + Assert.Contains("microphone", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("maxDurationMs", description); + Assert.Contains("text", description, System.StringComparison.OrdinalIgnoreCase); + // And explicitly NOT the generic stub. 
+ Assert.DoesNotContain("stt capability:", description); + } + + [Fact] + public async Task ToolsList_SttListen_HasCuratedDescription() + { + var caps = new List { new FakeCapability("stt", "stt.listen") }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var description = doc.RootElement.GetProperty("result") + .GetProperty("tools")[0] + .GetProperty("description") + .GetString()!; + + Assert.Contains("voice-activity detection", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("timeoutMs", description); + // Privacy: must mention NodeSttEnabled gate so MCP clients + // know this is opt-in. + Assert.Contains("NodeSttEnabled", description); + // Engine surface must be advertised so callers can read engineEffective. + Assert.Contains("engineEffective", description); + Assert.DoesNotContain("stt capability:", description); + } + + [Fact] + public async Task ToolsList_SttStatus_HasCuratedDescription() + { + var caps = new List { new FakeCapability("stt", "stt.status") }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var description = doc.RootElement.GetProperty("result") + .GetProperty("tools")[0] + .GetProperty("description") + .GetString()!; + + Assert.Contains("readiness", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("engine", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("whisper", description, System.StringComparison.OrdinalIgnoreCase); + // Privacy invariant in the description itself: no PII. 
+ Assert.Contains("no PII", description, System.StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task ToolsList_AllStt_AppearWhenSttCapabilityRegistered() + { + // Single SttCapability instance advertises all three commands. + var caps = new List + { + new FakeCapability("stt", "stt.transcribe", "stt.listen", "stt.status"), + }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var toolNames = new HashSet(); + foreach (var t in doc.RootElement.GetProperty("result").GetProperty("tools").EnumerateArray()) + toolNames.Add(t.GetProperty("name").GetString()!); + + Assert.Contains("stt.transcribe", toolNames); + Assert.Contains("stt.listen", toolNames); + Assert.Contains("stt.status", toolNames); + } + + [Fact] + public async Task ToolsList_AllStt_Absent_WhenSttCapabilityNotRegistered() + { + // STT capability is gated by NodeSttEnabled in NodeService; + // when disabled, no SttCapability is constructed and tools/list + // must omit the three stt.* tools. 
+ var caps = new List + { + new FakeCapability("device", "device.status"), + new FakeCapability("tts", "tts.speak"), + }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var toolNames = new HashSet(); + foreach (var t in doc.RootElement.GetProperty("result").GetProperty("tools").EnumerateArray()) + toolNames.Add(t.GetProperty("name").GetString()!); + + Assert.DoesNotContain("stt.transcribe", toolNames); + Assert.DoesNotContain("stt.listen", toolNames); + Assert.DoesNotContain("stt.status", toolNames); + } + [Fact] public async Task Initialize_ReturnsCustomServerNameAndVersion() { diff --git a/tests/OpenClaw.Shared.Tests/ModelsTests.cs b/tests/OpenClaw.Shared.Tests/ModelsTests.cs index ac9ac83f..ad93fb34 100644 --- a/tests/OpenClaw.Shared.Tests/ModelsTests.cs +++ b/tests/OpenClaw.Shared.Tests/ModelsTests.cs @@ -1732,4 +1732,34 @@ public void ContextSummaryShort_FormatsSmallNumbers() var session = new SessionInfo { TotalTokens = 500, ContextTokens = 1000 }; Assert.Contains("500/1.0K", session.ContextSummaryShort); } + + [Fact] + public void DangerousCommands_IncludesSttTranscribe() + { + Assert.Contains("stt.transcribe", CommandCenterCommandGroups.DangerousCommands); + Assert.Contains("stt.transcribe", (IReadOnlySet)CommandCenterCommandGroups.DangerousCommandSet); + // stt.listen and stt.status need the same explicit gateway opt-in so + // chat agents see them once NodeSttEnabled is on. Otherwise the + // gateway's Windows platform default policy keeps them hidden. + Assert.Contains("stt.listen", CommandCenterCommandGroups.DangerousCommands); + Assert.Contains("stt.status", CommandCenterCommandGroups.DangerousCommands); + } + + [Fact] + public void MacNodeParityCommands_ExcludesSttTranscribe() + { + // Mac has no equivalent yet; ensure parity diagnostic does not flag + // Windows nodes for "missing" stt.transcribe. 
+ Assert.DoesNotContain("stt.transcribe", CommandCenterCommandGroups.MacNodeParityCommands); + } + + [Fact] + public void CommonDangerousCommands_StillIncludedInMacParity() + { + // Refactor invariant: the original camera/screen dangerous commands + // still appear in Mac parity via the shared CommonDangerousCommands set. + Assert.Contains("camera.snap", CommandCenterCommandGroups.MacNodeParityCommands); + Assert.Contains("camera.clip", CommandCenterCommandGroups.MacNodeParityCommands); + Assert.Contains("screen.record", CommandCenterCommandGroups.MacNodeParityCommands); + } } diff --git a/tests/OpenClaw.Shared.Tests/SingleFlightDownloadTests.cs b/tests/OpenClaw.Shared.Tests/SingleFlightDownloadTests.cs new file mode 100644 index 00000000..3b2e3057 --- /dev/null +++ b/tests/OpenClaw.Shared.Tests/SingleFlightDownloadTests.cs @@ -0,0 +1,134 @@ +using System; +using System.Collections.Concurrent; +using System.Threading; +using System.Threading.Tasks; +using OpenClaw.Shared.Audio; +using Xunit; + +namespace OpenClaw.Shared.Tests; + +public sealed class SingleFlightDownloadTests +{ + [Fact] + public async Task ConcurrentCallers_StartOnlyOneSharedOperation() + { + var inFlight = new ConcurrentDictionary>(StringComparer.OrdinalIgnoreCase); + var release = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var started = 0; + + Task Start(CancellationToken _) + { + Interlocked.Increment(ref started); + return release.Task; + } + + var callers = new Task[50]; + for (var i = 0; i < callers.Length; i++) + { + callers[i] = SingleFlightDownload.RunAsync(inFlight, "asset", Start); + } + + await WaitUntilAsync(() => Volatile.Read(ref started) == 1); + release.SetResult(); + await Task.WhenAll(callers); + + Assert.Equal(1, Volatile.Read(ref started)); + await WaitUntilAsync(() => inFlight.IsEmpty); + } + + [Fact] + public async Task CancelingOneWaiter_DoesNotCancelSharedOperation() + { + var inFlight = new 
ConcurrentDictionary<string, Task>(StringComparer.OrdinalIgnoreCase);
+        var release = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
+        var started = 0;
+        CancellationToken sharedToken = default;
+
+        Task Start(CancellationToken token)
+        {
+            sharedToken = token;
+            Interlocked.Increment(ref started);
+            return release.Task;
+        }
+
+        using var callerCts = new CancellationTokenSource();
+        var canceledWaiter = SingleFlightDownload.RunAsync(inFlight, "asset", Start, callerCts.Token);
+        await WaitUntilAsync(() => Volatile.Read(ref started) == 1);
+
+        var continuingWaiter = SingleFlightDownload.RunAsync(inFlight, "asset", Start);
+        callerCts.Cancel();
+
+        await Assert.ThrowsAsync<TaskCanceledException>(() => canceledWaiter);
+        Assert.False(sharedToken.CanBeCanceled);
+
+        release.SetResult();
+        await continuingWaiter;
+
+        Assert.Equal(1, Volatile.Read(ref started));
+        await WaitUntilAsync(() => inFlight.IsEmpty);
+    }
+
+    [Fact]
+    public async Task FailedSharedOperation_IsRemovedSoRetryCanStart()
+    {
+        var inFlight = new ConcurrentDictionary<string, Task>(StringComparer.OrdinalIgnoreCase);
+        var attempts = 0;
+
+        Task Start(CancellationToken _)
+        {
+            return Interlocked.Increment(ref attempts) == 1
+                ?
Task.FromException(new InvalidOperationException("first failure"))
+                : Task.CompletedTask;
+        }
+
+        var ex = await Assert.ThrowsAsync<InvalidOperationException>(
+            () => SingleFlightDownload.RunAsync(inFlight, "asset", Start));
+        Assert.Equal("first failure", ex.Message);
+
+        await WaitUntilAsync(() => inFlight.IsEmpty);
+        await SingleFlightDownload.RunAsync(inFlight, "asset", Start);
+
+        Assert.Equal(2, Volatile.Read(ref attempts));
+    }
+
+    [Fact]
+    public async Task SynchronousFactoryFailure_IsRemovedSoRetryCanStart()
+    {
+        var inFlight = new ConcurrentDictionary<string, Task>(StringComparer.OrdinalIgnoreCase);
+        var attempts = 0;
+
+        Task Start(CancellationToken _)
+        {
+            if (Interlocked.Increment(ref attempts) == 1)
+            {
+                throw new InvalidOperationException("sync failure");
+            }
+
+            return Task.CompletedTask;
+        }
+
+        var ex = await Assert.ThrowsAsync<InvalidOperationException>(
+            () => SingleFlightDownload.RunAsync(inFlight, "asset", Start));
+        Assert.Equal("sync failure", ex.Message);
+
+        await WaitUntilAsync(() => inFlight.IsEmpty);
+        await SingleFlightDownload.RunAsync(inFlight, "asset", Start);
+
+        Assert.Equal(2, Volatile.Read(ref attempts));
+    }
+
+    private static async Task WaitUntilAsync(Func<bool> condition)
+    {
+        for (var i = 0; i < 100; i++)
+        {
+            if (condition())
+            {
+                return;
+            }
+
+            await Task.Delay(10);
+        }
+
+        Assert.True(condition());
+    }
+}
diff --git a/tests/OpenClaw.Shared.Tests/SpeechToTextLanguageNormalizationTests.cs b/tests/OpenClaw.Shared.Tests/SpeechToTextLanguageNormalizationTests.cs
new file mode 100644
index 00000000..971495a9
--- /dev/null
+++ b/tests/OpenClaw.Shared.Tests/SpeechToTextLanguageNormalizationTests.cs
@@ -0,0 +1,43 @@
+using OpenClaw.Shared.Audio;
+using Xunit;
+
+namespace OpenClaw.Shared.Tests;
+
+///
+/// SttCapability accepts BCP-47 language tags (the validator + MCP docs
+/// both advertise the wider shape like "en-US"), but Whisper.net's
+/// WithLanguage call only understands "auto" or 2-letter ISO 639-1 codes.
+/// SpeechToTextService.NormalizeForWhisper bridges the gap.
These tests +/// pin the normalization rules so a future change can't silently start +/// passing a region-tagged BCP-47 string straight to Whisper. +/// +public class SpeechToTextLanguageNormalizationTests +{ + [Theory] + [InlineData("auto", "auto")] + [InlineData("AUTO", "auto")] + [InlineData("en", "en")] + [InlineData("EN", "en")] + [InlineData("en-US", "en")] + [InlineData("en-us", "en")] + [InlineData("zh-Hans-CN", "zh")] + [InlineData("fr-FR", "fr")] + [InlineData(" ja-JP ", "ja")] + public void NormalizeForWhisper_StripsRegionAndScript(string input, string expected) + { + Assert.Equal(expected, SpeechToTextService.NormalizeForWhisper(input)); + } + + [Theory] + [InlineData(null)] + [InlineData("")] + [InlineData(" ")] + [InlineData("abc")] // 3-letter — no safe ISO 639-3 cross-walk + [InlineData("e")] // single letter + [InlineData("123-XX")] // numeric primary subtag + [InlineData("en1-US")] // non-letter primary + public void NormalizeForWhisper_FallsBackToAuto_OnInvalid(string? 
input) + { + Assert.Equal("auto", SpeechToTextService.NormalizeForWhisper(input)); + } +} diff --git a/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs b/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs index b14b9c2a..128571bf 100644 --- a/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs +++ b/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs @@ -2,6 +2,13 @@ namespace OpenClaw.Tray.Tests; +[CollectionDefinition(ActivityStreamServiceCollection.Name, DisableParallelization = true)] +public sealed class ActivityStreamServiceCollection +{ + public const string Name = "ActivityStreamService"; +} + +[Collection(ActivityStreamServiceCollection.Name)] public class ActivityStreamServiceTests : IDisposable { public ActivityStreamServiceTests() diff --git a/tests/OpenClaw.Tray.Tests/CapabilitiesPageLocalizationCoverageTests.cs b/tests/OpenClaw.Tray.Tests/CapabilitiesPageLocalizationCoverageTests.cs new file mode 100644 index 00000000..4ebee380 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/CapabilitiesPageLocalizationCoverageTests.cs @@ -0,0 +1,107 @@ +using System.Xml.Linq; + +namespace OpenClaw.Tray.Tests; + +/// +/// Pins that the STT/TTS card controls in CapabilitiesPage.xaml are localized +/// (have an x:Uid) and that en-us\Resources.resw provides matching keys. +/// +/// LocalizationValidationTests catches drift between locales, but does not +/// catch the case where a developer adds a control with hardcoded English +/// text and never registers it. This test closes that hole for the new +/// privacy-sensitive voice surface (the engine picker, the language input, +/// the ElevenLabs panel, and the deep-link to VoiceSettingsPage). 
+///
+public sealed class CapabilitiesPageLocalizationCoverageTests
+{
+    private static readonly XNamespace XNs = "http://schemas.microsoft.com/winfx/2006/xaml";
+
+    private static string GetRepositoryRoot()
+    {
+        var envRepoRoot = Environment.GetEnvironmentVariable("OPENCLAW_REPO_ROOT");
+        if (!string.IsNullOrWhiteSpace(envRepoRoot) && Directory.Exists(envRepoRoot))
+            return envRepoRoot;
+
+        var directory = new DirectoryInfo(AppContext.BaseDirectory);
+        while (directory != null)
+        {
+            if ((Directory.Exists(Path.Combine(directory.FullName, ".git")) ||
+                 File.Exists(Path.Combine(directory.FullName, ".git"))) &&
+                File.Exists(Path.Combine(directory.FullName, "README.md")))
+                return directory.FullName;
+            directory = directory.Parent;
+        }
+
+        throw new InvalidOperationException(
+            "Could not find repository root. Set OPENCLAW_REPO_ROOT to the repo path.");
+    }
+
+    private static string GetCapabilitiesXamlPath() =>
+        Path.Combine(GetRepositoryRoot(), "src", "OpenClaw.Tray.WinUI", "Pages", "CapabilitiesPage.xaml");
+
+    private static string GetEnUsReswPath() =>
+        Path.Combine(GetRepositoryRoot(), "src", "OpenClaw.Tray.WinUI", "Strings", "en-us", "Resources.resw");
+
+    private static HashSet<string> LoadReswKeys()
+    {
+        var doc = XDocument.Load(GetEnUsReswPath());
+        return doc.Descendants("data")
+            .Select(e => e.Attribute("name")!.Value)
+            .ToHashSet(StringComparer.Ordinal);
+    }
+
+    private static HashSet<string> LoadXamlUids()
+    {
+        var doc = XDocument.Load(GetCapabilitiesXamlPath());
+        return doc.Descendants()
+            .Select(e => e.Attribute(XNs + "Uid")?.Value)
+            .Where(v => !string.IsNullOrEmpty(v))
+            .Cast<string>()
+            .ToHashSet(StringComparer.Ordinal);
+    }
+
+    ///
+    /// Contract for the STT/TTS surface introduced by the audio merge.
+    /// Each entry: x:Uid + the resw key suffixes that MUST exist in en-us.
+    ///
+    public static IEnumerable<object[]> SttAndTtsCardUids => new[]
+    {
+        // STT card (deep-link to dedicated voice settings)
+        new object[] { "CapabilitiesPage_SttCardHeader", new[] { ".Text" } },
+        new object[] { "CapabilitiesPage_SttCardDescription", new[] { ".Text" } },
+        new object[] { "CapabilitiesPage_SttMoreSettingsLink", new[] { ".Content" } },
+        // TTS card (provider picker, ElevenLabs sub-panel)
+        new object[] { "CapabilitiesPage_TtsCardHeader", new[] { ".Text" } },
+        new object[] { "CapabilitiesPage_TtsCardDescription", new[] { ".Text" } },
+        new object[] { "CapabilitiesPage_TtsProviderComboBox", new[] { ".Header" } },
+        new object[] { "CapabilitiesPage_TtsProviderPiper", new[] { ".Content" } },
+        new object[] { "CapabilitiesPage_TtsProviderWindows", new[] { ".Content" } },
+        new object[] { "CapabilitiesPage_TtsProviderElevenLabs",new[] { ".Content" } },
+        new object[] { "CapabilitiesPage_TtsElevenLabsApiKey", new[] { ".Header" } },
+        new object[] { "CapabilitiesPage_TtsElevenLabsVoiceId", new[] { ".Header" } },
+        new object[] { "CapabilitiesPage_TtsElevenLabsModel", new[] { ".Header", ".PlaceholderText" } },
+        new object[] { "CapabilitiesPage_TtsElevenLabsHelp", new[] { ".Text" } },
+    };
+
+    [Theory]
+    [MemberData(nameof(SttAndTtsCardUids))]
+    public void SttOrTtsControl_HasXUid_InCapabilitiesPageXaml(string uid, string[] _)
+    {
+        var uids = LoadXamlUids();
+        Assert.Contains(uid, uids);
+    }
+
+    [Theory]
+    [MemberData(nameof(SttAndTtsCardUids))]
+    public void SttOrTtsControl_AllExpectedReswKeys_ExistInEnUs(string uid, string[] suffixes)
+    {
+        var keys = LoadReswKeys();
+        var missing = suffixes
+            .Select(suffix => uid + suffix)
+            .Where(key => !keys.Contains(key))
+            .ToList();
+
+        Assert.True(missing.Count == 0,
+            $"Missing en-us resw keys for x:Uid '{uid}': {string.Join(", ", missing)}");
+    }
+}
diff --git a/tests/OpenClaw.Tray.Tests/DeepLinkParserTests.cs b/tests/OpenClaw.Tray.Tests/DeepLinkParserTests.cs
index 6f36dfba..61d559c3 100644
---
a/tests/OpenClaw.Tray.Tests/DeepLinkParserTests.cs +++ b/tests/OpenClaw.Tray.Tests/DeepLinkParserTests.cs @@ -121,6 +121,21 @@ public void ParseDeepLink_TrailingSlash_IsStripped() Assert.Equal("settings", result.Path); } + [Theory] + [InlineData("openclaw://send/?message=hello", "send")] + [InlineData("openclaw://agent/?message=hi&key=abc", "agent")] + [InlineData("openclaw://activity/?filter=nodes", "activity")] + public void ParseDeepLink_TrailingSlashBeforeQuery_IsStripped(string uri, string expectedPath) + { + // Windows canonicalizes openclaw://send?... to openclaw://send/?... + // before handing it to us. The slash sits before the `?`, so a naïve + // TrimEnd before query split fails to strip it. Regression test for + // the off-by-one fix in DeepLinkParser.ParseDeepLink. + var result = DeepLinkParser.ParseDeepLink(uri); + Assert.NotNull(result); + Assert.Equal(expectedPath, result!.Path); + } + [Fact] public void ParseDeepLink_CaseInsensitiveScheme() { diff --git a/tests/OpenClaw.Tray.Tests/LocalizationValidationTests.cs b/tests/OpenClaw.Tray.Tests/LocalizationValidationTests.cs index 68eb2b7b..824a99f4 100644 --- a/tests/OpenClaw.Tray.Tests/LocalizationValidationTests.cs +++ b/tests/OpenClaw.Tray.Tests/LocalizationValidationTests.cs @@ -24,6 +24,18 @@ public class LocalizationValidationTests "Onboarding_Connection_Token", "WindowTitle_TrayMenu", "WindowTitle_Update", + // STT/TTS card invariants — these are protocol/brand identifiers + // not user-visible prose. They intentionally read the same in every + // locale: "eleven_multilingual_v2" is an ElevenLabs model + // identifier, "ElevenLabs" is a brand name. + // VoiceOverlayWindow window-title key — matches the convention + // for ChatWindow / HubWindow / CanvasWindow / TrayMenuWindow. + "VoiceOverlayWindow_winexWindowEx_2.Title", + "CapabilitiesPage_TtsElevenLabsModel.PlaceholderText", + "CapabilitiesPage_TtsProviderElevenLabs.Content", + // Sample IDs / brand identifiers — same across locales. 
+        "VoiceSettingsPage_ElevenLabsVoiceIdBox.PlaceholderText",
+        "VoiceSettingsPage_ElevenLabsModelBox.PlaceholderText",
     };
 
     private static readonly string[] RequiredRuntimeOnboardingKeys =
diff --git a/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs b/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs
new file mode 100644
index 00000000..69671057
--- /dev/null
+++ b/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs
@@ -0,0 +1,132 @@
+using OpenClawTray.Services;
+
+namespace OpenClaw.Tray.Tests;
+
+///
+/// Pins the optional-capability gating that drives both the gateway client
+/// path and the MCP-only path inside NodeService.RegisterCapabilities.
+///
+/// Privacy-sensitive defaults must be **off** even when settings are missing.
+/// A regression that flips Stt/Tts to default-on would silently advertise
+/// stt.transcribe / tts.speak the moment the tray launches with a fresh
+/// settings file, with no user opt-in.
+///
+public sealed class NodeCapabilityGatingTests : IDisposable
+{
+    private readonly List<string> _tempDirs = new();
+
+    public void Dispose()
+    {
+        foreach (var dir in _tempDirs)
+        {
+            if (Directory.Exists(dir))
+                Directory.Delete(dir, recursive: true);
+        }
+    }
+
+    private SettingsManager NewSettings()
+    {
+        var dir = Path.Combine(Path.GetTempPath(), "openclaw-tray-tests-" + Guid.NewGuid().ToString("N"));
+        Directory.CreateDirectory(dir);
+        _tempDirs.Add(dir);
+        return new SettingsManager(dir);
+    }
+
+    [Fact]
+    public void NullSettings_DefaultOnCapabilities_AreEnabled()
+    {
+        // Defensive default: when settings are not yet loaded, we still
+        // advertise the non-privacy-sensitive capabilities so the node is
+        // usable immediately.
+ Assert.True(NodeCapabilityGating.ShouldRegisterCanvas(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterScreen(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterCamera(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterLocation(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterBrowserProxy(null)); + } + + [Fact] + public void NullSettings_PrivacySensitiveCapabilities_AreDisabled() + { + // Privacy invariant: TTS and STT must require an explicit user + // opt-in. A null/missing settings object must not enable mic capture + // or speaker output. + Assert.False(NodeCapabilityGating.ShouldRegisterTts(null)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(null)); + } + + [Fact] + public void DefaultSettings_PrivacySensitiveCapabilities_AreDisabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void DefaultSettings_OtherCapabilities_AreEnabled() + { + var s = NewSettings(); + Assert.True(NodeCapabilityGating.ShouldRegisterCanvas(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterScreen(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterCamera(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterLocation(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterBrowserProxy(s)); + } + + [Fact] + public void Tts_OnlyAdvertisedWhenExplicitlyEnabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + s.NodeTtsEnabled = true; + Assert.True(NodeCapabilityGating.ShouldRegisterTts(s)); + s.NodeTtsEnabled = false; + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + } + + [Fact] + public void Stt_OnlyAdvertisedWhenExplicitlyEnabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + s.NodeSttEnabled = true; + Assert.True(NodeCapabilityGating.ShouldRegisterStt(s)); + s.NodeSttEnabled = false; + 
Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void TtsAndStt_Independent() + { + // A user who enables only TTS (output) must not silently enable STT + // (input), and vice versa. Each capability is its own consent surface. + var s = NewSettings(); + s.NodeTtsEnabled = true; + s.NodeSttEnabled = false; + Assert.True(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + + s.NodeTtsEnabled = false; + s.NodeSttEnabled = true; + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void DefaultOnCapabilities_OnlyDisabledWhenExplicitlySetToFalse() + { + var s = NewSettings(); + s.NodeCanvasEnabled = false; + s.NodeScreenEnabled = false; + s.NodeCameraEnabled = false; + s.NodeLocationEnabled = false; + s.NodeBrowserProxyEnabled = false; + + Assert.False(NodeCapabilityGating.ShouldRegisterCanvas(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterScreen(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterCamera(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterLocation(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterBrowserProxy(s)); + } +} diff --git a/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs b/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs new file mode 100644 index 00000000..06f9a576 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs @@ -0,0 +1,127 @@ +using OpenClawTray.Services; + +namespace OpenClaw.Tray.Tests; + +/// +/// Privacy regression tests for the activity-stream details formatter that +/// powers both the recent-activity menu and the support bundle. 
+/// +/// The end-to-end persistence path is: +/// NodeService.OnNodeInvokeCompleted (capability handler exception) +/// → App.OnNodeInvokeCompleted +/// → NodeInvokeActivityFormatter.BuildDetails +/// → ActivityStreamService.Add +/// → ActivityStreamService.BuildSupportBundle (when user shares logs) +/// +/// For privacy-sensitive commands (mic / camera / screen), no caller-supplied +/// arg or runtime detail may reach support bundles. This test pins that. +/// +[Collection(ActivityStreamServiceCollection.Name)] +public sealed class NodeInvokeActivityFormatterTests : IDisposable +{ + public NodeInvokeActivityFormatterTests() => ActivityStreamService.Clear(); + public void Dispose() => ActivityStreamService.Clear(); + + [Theory] + [InlineData("stt.transcribe")] + [InlineData("stt.listen")] + [InlineData("stt.status")] + [InlineData("camera.snap")] + [InlineData("camera.clip")] + [InlineData("screen.snapshot")] + [InlineData("screen.record")] + public void PrivacySensitive_FailedInvoke_OmitsErrorTextFromDetails(string command) + { + const string secret = "secret-language-or-device-detail"; + var details = NodeInvokeActivityFormatter.BuildDetails(command, ok: false, durationMs: 4321, error: secret); + + Assert.Equal("privacy-sensitive · 4321 ms · error", details); + Assert.DoesNotContain(secret, details); + } + + [Fact] + public void PrivacySensitive_FailedInvoke_SecretDoesNotReachSupportBundle() + { + const string secret = "secret-language-or-device-detail"; + var details = NodeInvokeActivityFormatter.BuildDetails("stt.transcribe", ok: false, durationMs: 1234, error: secret); + + ActivityStreamService.Add( + category: "node.invoke", + title: "node.invoke failed: stt.transcribe", + details: details, + nodeId: "test-node"); + + var bundle = ActivityStreamService.BuildSupportBundle(); + Assert.DoesNotContain(secret, bundle); + Assert.Contains("privacy-sensitive · 1234 ms · error", bundle); + } + + [Fact] + public void PrivacySensitive_SuccessfulInvoke_OmitsAllDetail() 
+ { + var details = NodeInvokeActivityFormatter.BuildDetails("stt.transcribe", ok: true, durationMs: 800, error: null); + Assert.Equal("privacy-sensitive · 800 ms", details); + } + + [Fact] + public void NonPrivacySensitive_FailedInvoke_KeepsErrorForDiagnostics() + { + // Non-privacy-sensitive commands (metadata / exec) keep the error text + // because they're useful for diagnostics and don't carry mic/camera args. + var details = NodeInvokeActivityFormatter.BuildDetails( + "device.status", + ok: false, + durationMs: 50, + error: "gateway unreachable"); + + Assert.Equal("metadata · 50 ms · gateway unreachable", details); + } + + [Fact] + public void NonPrivacySensitive_FailedInvoke_NullError_FallsBackToUnknown() + { + var details = NodeInvokeActivityFormatter.BuildDetails("device.status", ok: false, durationMs: 0, error: null); + Assert.Equal("metadata · 0 ms · unknown error", details); + } + + [Fact] + public void Exec_FailedInvoke_KeepsErrorForDiagnostics() + { + var details = NodeInvokeActivityFormatter.BuildDetails( + "system.run", + ok: false, + durationMs: 100, + error: "exit code 1"); + + Assert.Equal("exec · 100 ms · exit code 1", details); + } + + [Fact] + public void NegativeDuration_ClampsToZero() + { + var details = NodeInvokeActivityFormatter.BuildDetails("device.status", ok: true, durationMs: -7, error: null); + Assert.Equal("metadata · 0 ms", details); + } + + [Theory] + [InlineData("stt.transcribe", "privacy-sensitive")] + [InlineData("STT.Transcribe", "privacy-sensitive")] + [InlineData("stt.listen", "privacy-sensitive")] + [InlineData("Stt.Listen", "privacy-sensitive")] + [InlineData("stt.status", "privacy-sensitive")] + [InlineData("stt.future-command", "privacy-sensitive")] // any new stt.* defaults privacy-sensitive + [InlineData("camera.snap", "privacy-sensitive")] + [InlineData("camera.clip", "privacy-sensitive")] + [InlineData("screen.snapshot", "privacy-sensitive")] + [InlineData("screen.record", "privacy-sensitive")] + 
[InlineData("system.run", "exec")] + [InlineData("system.run.shell", "exec")] + [InlineData("device.status", "metadata")] + [InlineData("tts.speak", "privacy-sensitive")] // TTS errors can leak ElevenLabs key fragments / device names + [InlineData("tts.future-command", "privacy-sensitive")] // any future tts.* defaults privacy-sensitive + [InlineData("", "metadata")] + public void GetPrivacyClass_KnownCommands(string command, string expected) + { + Assert.Equal(expected, NodeInvokeActivityFormatter.GetPrivacyClass(command)); + } +} diff --git a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj index 30f0256e..40340184 100644 --- a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj +++ b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj @@ -26,6 +26,8 @@ + + diff --git a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs index f1996b3c..03b84c6f 100644 --- a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +++ b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs @@ -37,11 +37,20 @@ public void RoundTrip_AllFields_Preserved() NodeCameraEnabled = false, NodeLocationEnabled = true, NodeBrowserProxyEnabled = false, + NodeSttEnabled = true, + SttLanguage = "en-GB", + SttModelName = "tiny", + SttSilenceTimeout = 2.5f, + VoiceTtsEnabled = false, + VoiceAudioFeedback = false, NodeTtsEnabled = true, TtsProvider = "elevenlabs", TtsElevenLabsApiKey = "elevenlabs-key", TtsElevenLabsModel = "eleven_multilingual_v2", TtsElevenLabsVoiceId = "voice-123", + TtsWindowsVoiceId = "Microsoft Zira Desktop", + HubNavPaneOpen = false, + TtsPiperVoiceId = "fr_FR-siwis-low", HasSeenActivityStreamTip = true, SkippedUpdateTag = "v1.2.3", NotifyChatResponses = false, @@ -82,11 +91,20 @@ public void RoundTrip_AllFields_Preserved() Assert.Equal(original.NodeCameraEnabled, restored.NodeCameraEnabled); Assert.Equal(original.NodeLocationEnabled, restored.NodeLocationEnabled); 
Assert.Equal(original.NodeBrowserProxyEnabled, restored.NodeBrowserProxyEnabled); + Assert.Equal(original.NodeSttEnabled, restored.NodeSttEnabled); + Assert.Equal(original.SttLanguage, restored.SttLanguage); + Assert.Equal(original.SttModelName, restored.SttModelName); + Assert.Equal(original.SttSilenceTimeout, restored.SttSilenceTimeout); + Assert.Equal(original.VoiceTtsEnabled, restored.VoiceTtsEnabled); + Assert.Equal(original.VoiceAudioFeedback, restored.VoiceAudioFeedback); Assert.Equal(original.NodeTtsEnabled, restored.NodeTtsEnabled); Assert.Equal(original.TtsProvider, restored.TtsProvider); Assert.Equal(original.TtsElevenLabsApiKey, restored.TtsElevenLabsApiKey); Assert.Equal(original.TtsElevenLabsModel, restored.TtsElevenLabsModel); Assert.Equal(original.TtsElevenLabsVoiceId, restored.TtsElevenLabsVoiceId); + Assert.Equal(original.TtsWindowsVoiceId, restored.TtsWindowsVoiceId); + Assert.Equal(original.HubNavPaneOpen, restored.HubNavPaneOpen); + Assert.Equal(original.TtsPiperVoiceId, restored.TtsPiperVoiceId); Assert.Equal(original.HasSeenActivityStreamTip, restored.HasSeenActivityStreamTip); Assert.Equal(original.SkippedUpdateTag, restored.SkippedUpdateTag); Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses); @@ -144,8 +162,10 @@ public void MissingFields_UseDefaults() Assert.True(settings.NodeCameraEnabled); Assert.True(settings.NodeLocationEnabled); Assert.True(settings.NodeBrowserProxyEnabled); + Assert.False(settings.NodeSttEnabled); + Assert.Equal("auto", settings.SttLanguage); Assert.False(settings.NodeTtsEnabled); - Assert.Equal("windows", settings.TtsProvider); + Assert.Equal("piper", settings.TtsProvider); Assert.Null(settings.TtsElevenLabsApiKey); Assert.Null(settings.TtsElevenLabsModel); Assert.Null(settings.TtsElevenLabsVoiceId); @@ -153,9 +173,24 @@ public void MissingFields_UseDefaults() Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.NotifyChatResponses); Assert.True(settings.PreferStructuredCategories); + 
// HubNavPaneOpen defaults to true (NavView starts expanded for new + // installs and for any settings file that predates the field). + Assert.True(settings.HubNavPaneOpen); Assert.Null(settings.UserRules); } + [Fact] + public void HubNavPaneOpen_DefaultsTrue_ForEmptyJson() + { + // Existing users have a settings file written before HubNavPaneOpen + // existed. The default-true initializer must survive deserialization + // of a missing field so the NavView lands expanded for them, not + // silently collapsed. + var settings = SettingsData.FromJson("{}"); + Assert.NotNull(settings); + Assert.True(settings!.HubNavPaneOpen); + } + [Fact] public void BackwardCompatibility_OldSettingsWithoutNewFields() { @@ -198,14 +233,18 @@ public void BackwardCompatibility_OldSettingsWithoutNewFields() Assert.True(settings.NodeCameraEnabled); Assert.True(settings.NodeLocationEnabled); Assert.True(settings.NodeBrowserProxyEnabled); + Assert.False(settings.NodeSttEnabled); + Assert.Equal("auto", settings.SttLanguage); Assert.False(settings.NodeTtsEnabled); - Assert.Equal("windows", settings.TtsProvider); + Assert.Equal("piper", settings.TtsProvider); Assert.Null(settings.TtsElevenLabsApiKey); Assert.Null(settings.TtsElevenLabsModel); Assert.Null(settings.TtsElevenLabsVoiceId); Assert.False(settings.HasSeenActivityStreamTip); Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.GlobalHotkeyEnabled); + // HubNavPaneOpen wasn't in this older JSON shape; default true. + Assert.True(settings.HubNavPaneOpen); Assert.Null(settings.UserRules); }