From f2568dedb0f5b76787764b15a75646f5fd9cb3e0 Mon Sep 17 00:00:00 2001 From: Regis Brid Date: Fri, 1 May 2026 18:19:56 -0700 Subject: [PATCH 01/34] Add Windows STT transcribe capability Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 1 + docs/WINDOWS_NODE_TESTING.md | 35 ++ docs/gateway-node-integration.md | 6 +- .../Capabilities/SttCapability.cs | 153 +++++++ src/OpenClaw.Shared/Mcp/McpToolBridge.cs | 4 + src/OpenClaw.Shared/Models.cs | 12 +- src/OpenClaw.Shared/SettingsData.cs | 2 + src/OpenClaw.Tray.WinUI/App.xaml.cs | 3 +- .../OpenClaw.Tray.WinUI.csproj | 1 + .../Services/NodeService.cs | 59 +++ .../Services/SettingsManager.cs | 6 + .../SpeechToText/SpeechToTextService.cs | 411 ++++++++++++++++++ .../Windows/SettingsWindow.xaml | 27 ++ .../Windows/SettingsWindow.xaml.cs | 6 + .../OpenClaw.Shared.Tests/CapabilityTests.cs | 227 ++++++++++ .../McpToolBridgeTests.cs | 24 + tests/OpenClaw.Shared.Tests/ModelsTests.cs | 25 ++ .../SettingsRoundTripTests.cs | 8 + 18 files changed, 1004 insertions(+), 6 deletions(-) create mode 100644 src/OpenClaw.Shared/Capabilities/SttCapability.cs create mode 100644 src/OpenClaw.Tray.WinUI/Services/SpeechToText/SpeechToTextService.cs diff --git a/README.md b/README.md index 26815f37..b5fe075a 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,7 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t | **Canvas** | `canvas.present`, `canvas.hide`, `canvas.navigate`, `canvas.eval`, `canvas.snapshot`, `canvas.a2ui.push`, `canvas.a2ui.pushJSONL`, `canvas.a2ui.reset` | Display and control a WebView2 window | | **Screen** | `screen.snapshot`, `screen.record` | Capture screenshots and fixed-duration MP4 screen recordings | | **Camera** | `camera.list`, `camera.snap`, `camera.clip` | Enumerate cameras and capture still photos or short video clips | +| **Speech-to-text** | `stt.transcribe` | Capture audio from the default microphone for a bounded duration and return 
transcribed text. Default-off; opt-in via Settings. When enabled, advertised to both gateway callers (subject to gateway allowlist) and local MCP clients (subject to bearer token). | | **Location** | `location.get` | Return Windows geolocation when permission is available | | **Device** | `device.info`, `device.status` | Return Windows host/app metadata and lightweight status | diff --git a/docs/WINDOWS_NODE_TESTING.md b/docs/WINDOWS_NODE_TESTING.md index 2f60127a..562c84e7 100644 --- a/docs/WINDOWS_NODE_TESTING.md +++ b/docs/WINDOWS_NODE_TESTING.md @@ -61,6 +61,7 @@ These features need the gateway to send `node.invoke` commands: | `location.get` | Get Windows location | Uses Windows location permission/settings | | `device.info` / `device.status` | Device metadata/status | Returns host/app/locale plus battery/storage/network/uptime payloads | | `browser.proxy` | Proxy browser-control host requests | Requires Browser proxy bridge enabled, a compatible browser-control host listening on gateway port + 2, and matching browser-control auth | +| `stt.transcribe` | Speech-to-text from default microphone | Default-off; bounded `maxDurationMs` ≤ 30000; concatenates phrases until duration elapses; requires explicit gateway allowlist | ## Capabilities Advertised @@ -110,6 +111,40 @@ When the node connects, it advertises these capabilities: - If you see "Camera access blocked", enable camera access for desktop apps in Windows Privacy settings - Packaged MSIX builds will show the system consent prompt automatically +### `stt.transcribe` returns "Speech recognition failed" or "Internal Speech Error" +- Open Windows Settings → Privacy & security → Speech (`ms-settings:privacy-speech`) +- Turn **Online speech recognition** = On. 
The Windows speech recognizer's default dictation grammar often fails without it, and Windows surfaces an unmapped HRESULT as "Internal Speech Error" +- Open Windows Settings → Time & language → Language & region (`ms-settings:regionlanguage`), select your display language → Language options, and confirm **Speech** appears under Installed features (install it if not, ~50 MB; reboot or sign out/in afterward) +- Verify the recognizer end-to-end with `ms-settings:speech` → "Microphone" → **Get started** before re-trying `stt.transcribe` + +### `stt.transcribe` returns "Microphone permission denied" +- Open Windows Settings → Privacy & security → Microphone +- Ensure **Microphone access** (top-level toggle) is on +- For **unpackaged** tray builds (the default `.\build.ps1` output): ensure **Let desktop apps access your microphone** is on. The tray exe will **not** appear as its own row — desktop-app access is granted as a group, not per-app +- For **packaged MSIX** tray builds: the tray appears as its own entry under "Let apps access your microphone" and must be individually enabled (the OS shows a consent prompt on first use) +- After changing permissions, re-pair the node so the gateway picks up the new advertised command + +### `stt.transcribe` returns "Language pack 'X' is not installed" +- Open Windows Settings → Time & language → Language & region +- Add the requested display language and ensure the **Speech** optional feature is installed +- Restart the tray after installing the speech pack + +### Manual STT validation +1. Enable Node Mode in Settings. +2. Enable **Speech-to-text (microphone)** in Settings → Node mode. +3. Append `stt.transcribe` to your existing gateway allowlist (do **not** copy a literal `...` — substitute the commands you already allow). 
For example, starting from the recommended Windows safe companion list: + ```bash + openclaw config set gateway.nodes.allowCommands '["canvas.present","canvas.hide","canvas.navigate","canvas.eval","canvas.snapshot","canvas.a2ui.push","canvas.a2ui.pushJSONL","canvas.a2ui.reset","camera.list","location.get","screen.snapshot","device.info","device.status","system.execApprovals.get","system.execApprovals.set","stt.transcribe"]' + openclaw gateway restart + ``` +4. Re-pair or re-approve the node so the gateway refreshes its command snapshot. +5. Invoke and speak a short phrase: + ```bash + openclaw nodes invoke --node <node-id> --command stt.transcribe \ + --params '{"maxDurationMs":5000,"language":"en-US"}' + ``` +6. The Windows microphone OS indicator should appear during recognition. Confirm a `transcribed:true` payload returns the text. + ## Remaining Work (Roadmap) 1. ~~**system.run + exec approvals**~~ ✅ Implemented diff --git a/docs/gateway-node-integration.md b/docs/gateway-node-integration.md index ee62390a..c7594f64 100644 --- a/docs/gateway-node-integration.md +++ b/docs/gateway-node-integration.md @@ -334,7 +334,7 @@ Recommended gateway defaults: | Command bucket | Windows default? 
| Reason | |----------------|------------------|--------| | Safe declared companion commands: `canvas.*`, `camera.list`, `location.get`, `screen.snapshot`, `device.info`, `device.status` | Yes | Matches macOS parity and only applies when declared by the node | -| Dangerous/privacy-heavy commands: `camera.snap`, `camera.clip`, `screen.record`, write commands like `contacts.add` | No | Existing gateway model already requires explicit `gateway.nodes.allowCommands` | +| Dangerous/privacy-heavy commands: `camera.snap`, `camera.clip`, `screen.record`, `stt.transcribe`, write commands like `contacts.add` | No | Existing gateway model already requires explicit `gateway.nodes.allowCommands` | | Exec commands: `system.run`, `system.run.prepare`, `system.which`, `system.notify`, `browser.proxy` | Yes | Existing Windows headless-host behavior | Until the gateway expands Windows safe defaults, the practical local solution is: @@ -362,6 +362,7 @@ Privacy-sensitive commands should stay out of the default safe list and should o camera.snap camera.clip screen.record +stt.transcribe ``` After changing either `gateway.nodes.allowCommands` or `gateway.nodes.denyCommands`, re-approve or re-pair the Windows node. Approved device records may keep a snapshot of the commands that were visible at approval time, so a gateway restart alone may not refresh existing approvals. @@ -422,6 +423,7 @@ Proposal: - `camera.snap` - `camera.clip` - `screen.record` + - `stt.transcribe` - write commands such as `contacts.add`, `calendar.add`, etc. This does not grant capabilities to headless Windows hosts by itself. A command still has to pass both gates: the node must declare it in `commands`, and the gateway policy must allow it. Headless Windows node hosts that only declare `system.run` / `system.which` remain exec-only. @@ -439,7 +441,7 @@ When shipping the Windows node, README/wiki should tell users: > ``` > Then re-pair the node (`openclaw devices reject ` + re-approve). 
> -> Add `camera.snap`, `camera.clip`, and `screen.record` only when you explicitly want to allow privacy-sensitive camera or screen capture. +> Add `camera.snap`, `camera.clip`, `screen.record`, and `stt.transcribe` only when you explicitly want to allow privacy-sensitive camera, screen, or microphone capture. > > The Windows tray Command Center (`openclaw://commandcenter`) surfaces these policy problems directly: it separates safe companion allowlist fixes from privacy-sensitive opt-ins and provides copyable repair text for safe fixes or pending pairing approval. diff --git a/src/OpenClaw.Shared/Capabilities/SttCapability.cs b/src/OpenClaw.Shared/Capabilities/SttCapability.cs new file mode 100644 index 00000000..f401b606 --- /dev/null +++ b/src/OpenClaw.Shared/Capabilities/SttCapability.cs @@ -0,0 +1,153 @@ +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Capabilities; + +/// +/// Standalone speech-to-text capability for the Windows node. +/// +/// Mirrors in shape: shared bounds & arg +/// parsing, with a single event hook that the tray's +/// SpeechToTextService implements using +/// Windows.Media.SpeechRecognition.SpeechContinuousRecognitionSession. +/// +/// Audio in → text out. No Talk Mode integration, no chat send, +/// no wake word. Default-off privacy-sensitive command. +/// +public sealed class SttCapability : NodeCapabilityBase +{ + public const string TranscribeCommand = "stt.transcribe"; + public const string WindowsProvider = "windows"; + public const int MaxDurationMs = 30_000; + public const string DefaultLanguage = "en-US"; + + private static readonly string[] _commands = [TranscribeCommand]; + + // Conservative BCP-47 check: 2-3 letter language, optional script + // (4 letter), optional region (2 letter or 3 digit), each separated + // by a hyphen. 
Rejects whitespace and punctuation that would otherwise + // trip Windows.Globalization.Language ctor. + private static readonly Regex BcpTagRegex = new( + "^[A-Za-z]{2,3}(?:-[A-Za-z]{4})?(?:-(?:[A-Za-z]{2}|[0-9]{3}))?$", + RegexOptions.Compiled); + + public override string Category => "stt"; + public override IReadOnlyList Commands => _commands; + + /// + /// Tray-side handler that performs the actual recognition. + /// + public event Func>? TranscribeRequested; + + public SttCapability(IOpenClawLogger logger) : base(logger) + { + } + + /// + /// Resolve the language string callers should run recognition with: + /// per-call argument wins, then configured setting, then default. + /// Returns null if the supplied string fails the BCP-47 sanity check + /// (caller should map this to a clear error). + /// + public static string? ResolveLanguage(string? requested, string? configured) + { + var candidate = !string.IsNullOrWhiteSpace(requested) + ? requested + : (!string.IsNullOrWhiteSpace(configured) ? configured : DefaultLanguage); + + return NormalizeLanguageTag(candidate!); + } + + /// + /// Trim and BCP-47-validate a single tag. Returns the trimmed tag on + /// success or null if the input is not a recognizable language tag. + /// + private static string? NormalizeLanguageTag(string tag) + { + var trimmed = tag.Trim(); + return BcpTagRegex.IsMatch(trimmed) ? trimmed : null; + } + + public override Task ExecuteAsync(NodeInvokeRequest request) + => ExecuteAsync(request, CancellationToken.None); + + public override async Task ExecuteAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + if (!string.Equals(request.Command, TranscribeCommand, StringComparison.Ordinal)) + return Error($"Unknown command: {request.Command}"); + + // maxDurationMs is required and bounded server-side. We deliberately + // reject 0/negative rather than substituting a default — callers + // explicitly choose how much mic time they're spending. 
+ var maxDurationMs = GetIntArg(request.Args, "maxDurationMs", 0); + if (maxDurationMs <= 0) + return Error("Missing required maxDurationMs"); + if (maxDurationMs > MaxDurationMs) + return Error($"maxDurationMs exceeds {MaxDurationMs} ms"); + + var requestedLanguage = GetStringArg(request.Args, "language"); + string? resolvedLanguage = null; + if (!string.IsNullOrWhiteSpace(requestedLanguage)) + { + resolvedLanguage = NormalizeLanguageTag(requestedLanguage); + if (resolvedLanguage == null) + return Error($"Invalid language tag '{requestedLanguage}'"); + } + + if (TranscribeRequested == null) + return Error("STT transcribe not available"); + + var args = new SttTranscribeArgs + { + MaxDurationMs = maxDurationMs, + Language = resolvedLanguage // null lets the tray fall back to its configured setting + }; + + Logger.Info($"stt.transcribe: maxDurationMs={args.MaxDurationMs}, language={args.Language ?? "(default)"}"); + + try + { + var result = await TranscribeRequested(args, cancellationToken).ConfigureAwait(false); + return Success(new + { + transcribed = result.Transcribed, + text = result.Text, + durationMs = result.DurationMs, + language = result.Language + }); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + return Error("Transcribe canceled"); + } + catch (Exception ex) + { + // Exception message only — capability never sees the transcript. + Logger.Error("STT transcribe failed", ex); + return Error($"Transcribe failed: {ex.Message}"); + } + } +} + +public sealed class SttTranscribeArgs +{ + public int MaxDurationMs { get; set; } + /// + /// BCP-47 language tag (e.g., "en-US"). Null lets the tray service + /// fall back to its configured SttLanguage setting. + /// + public string? 
Language { get; set; } +} + +public sealed class SttTranscribeResult +{ + public bool Transcribed { get; set; } + public string Text { get; set; } = ""; + public int DurationMs { get; set; } + public string Language { get; set; } = SttCapability.DefaultLanguage; +} diff --git a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs index e3d19dce..69a428ae 100644 --- a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs +++ b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs @@ -229,6 +229,10 @@ private object HandleToolsList() "Capture a still photo from a camera. Args: deviceId (string, optional — defaults to system default camera), format ('jpeg'|'png', default 'jpeg'), maxWidth (int, default 1280), quality (int 1-100, default 80). Returns { format, width, height, base64 }.", ["camera.clip"] = "Record a short clip from a camera. Args: deviceId (string, optional), durationMs (int, required, max 60000), format ('mp4'|'webm', default 'mp4'), maxWidth (int, default 1280). Returns { format, durationMs, base64 }.", + + // stt.* — bounded microphone capture → text. Default-off privacy-sensitive. + ["stt.transcribe"] = + "Capture audio from the default microphone for a bounded duration and return the transcribed text. Args: maxDurationMs (int, required, > 0, max 30000), language (string, optional BCP-47 tag like 'en-US' — falls back to the configured SttLanguage setting). Uses Windows.Media.SpeechRecognition (local recognizer; OS may use online services for some configurations). Returns { transcribed, text, durationMs, language }. 
Requires NodeSttEnabled in tray Settings; when enabled, exposed to both gateway callers (subject to the gateway allowlist) and local MCP clients (subject to the MCP bearer token).", }; private async Task HandleToolsCallAsync(JsonElement parameters, CancellationToken cancellationToken) diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs index 0d7888a8..57ede5bd 100644 --- a/src/OpenClaw.Shared/Models.cs +++ b/src/OpenClaw.Shared/Models.cs @@ -1019,13 +1019,19 @@ public static class CommandCenterCommandGroups public static readonly FrozenSet SafeCompanionCommandSet = SafeCompanionCommands.ToFrozenSet(StringComparer.OrdinalIgnoreCase); - public static readonly string[] DangerousCommands = + public static readonly string[] CommonDangerousCommands = [ "camera.snap", "camera.clip", "screen.record" ]; + public static readonly string[] DangerousCommands = + [ + .. CommonDangerousCommands, + "stt.transcribe" + ]; + public static readonly FrozenSet DangerousCommandSet = DangerousCommands.ToFrozenSet(StringComparer.OrdinalIgnoreCase); @@ -1046,7 +1052,7 @@ public static class CommandCenterCommandGroups public static readonly string[] MacNodeParityCommands = [ .. SafeCompanionCommands, - .. DangerousCommands, + .. CommonDangerousCommands, "system.notify", "system.run", "system.which", @@ -1232,7 +1238,7 @@ public static List BuildNodeWarnings(NodeCapabilityHea Severity = GatewayDiagnosticSeverity.Info, Category = "allowlist", Title = "Privacy-sensitive commands are currently blocked", - Detail = $"{blocked} {(node.MissingDangerousAllowlistCommands.Count == 1 ? "is" : "are")} declared but filtered by gateway policy. Leave blocked unless you explicitly want camera or screen recording access for this node.", + Detail = $"{blocked} {(node.MissingDangerousAllowlistCommands.Count == 1 ? "is" : "are")} declared but filtered by gateway policy. 
Leave blocked unless you explicitly want camera, microphone, or screen recording access for this node.", RepairAction = "Copy opt-in guidance", CopyText = BuildDangerousCommandOptInGuidance(node.MissingDangerousAllowlistCommands) }); diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs index 6f3fe495..5fa85686 100644 --- a/src/OpenClaw.Shared/SettingsData.cs +++ b/src/OpenClaw.Shared/SettingsData.cs @@ -34,6 +34,8 @@ public class SettingsData public bool NodeCameraEnabled { get; set; } = true; public bool NodeLocationEnabled { get; set; } = true; public bool NodeBrowserProxyEnabled { get; set; } = true; + public bool NodeSttEnabled { get; set; } = false; + public string SttLanguage { get; set; } = "en-US"; /// Run the local MCP HTTP server. Independent of EnableNodeMode. public bool EnableMcpServer { get; set; } = false; /// diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs index 8f93a14c..9c38b1cb 100644 --- a/src/OpenClaw.Tray.WinUI/App.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs @@ -1396,7 +1396,8 @@ private static string GetNodeInvokePrivacyClass(string command) if (string.Equals(command, "screen.record", StringComparison.OrdinalIgnoreCase) || string.Equals(command, "screen.snapshot", StringComparison.OrdinalIgnoreCase) || string.Equals(command, "camera.snap", StringComparison.OrdinalIgnoreCase) || - string.Equals(command, "camera.clip", StringComparison.OrdinalIgnoreCase)) + string.Equals(command, "camera.clip", StringComparison.OrdinalIgnoreCase) || + string.Equals(command, "stt.transcribe", StringComparison.OrdinalIgnoreCase)) { return "privacy-sensitive"; } diff --git a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj index 3285f99a..cbc56875 100644 --- a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj +++ b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj @@ -56,6 +56,7 @@ + diff --git 
a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs index 2d9a6ac9..42ebec0b 100644 --- a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs +++ b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs @@ -69,6 +69,8 @@ public sealed class NodeService : IDisposable private LocationCapability? _locationCapability; private DeviceCapability? _deviceCapability; private BrowserProxyCapability? _browserProxyCapability; + private SttCapability? _sttCapability; + private SpeechToText.SpeechToTextService? _speechToTextService; private readonly string _dataPath; private string? _token; @@ -286,6 +288,14 @@ private void RegisterCapabilities() _deviceCapability = new DeviceCapability(_logger); Register(_deviceCapability); + if (_settings?.NodeSttEnabled == true) + { + _speechToTextService ??= new SpeechToText.SpeechToTextService(_logger, _settings); + _sttCapability = new SttCapability(_logger); + _sttCapability.TranscribeRequested += OnSttTranscribeAsync; + Register(_sttCapability); + } + // BrowserProxy needs a live gateway connection — only register when gateway is up. 
if (_nodeClient != null && _settings?.NodeBrowserProxyEnabled != false) { @@ -447,6 +457,8 @@ private List BuildDisabledCommands() disabled.AddRange(CommandCenterCommandGroups.SafeCompanionCommands.Where(command => command.StartsWith("location.", StringComparison.OrdinalIgnoreCase))); if (_settings?.NodeBrowserProxyEnabled == false) disabled.Add("browser.proxy"); + if (_settings?.NodeSttEnabled != true) + disabled.Add(SttCapability.TranscribeCommand); return disabled; } @@ -1268,6 +1280,52 @@ private async Task GetLocationAsync(LocationGetArgs args) #endregion + #region STT Capability Handlers + + private Task OnSttTranscribeAsync(SttTranscribeArgs args, CancellationToken cancellationToken) + { + if (_speechToTextService == null) + throw new InvalidOperationException("Speech-to-text service not available"); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var registration = cancellationToken.Register(() => tcs.TrySetCanceled(cancellationToken)); + + bool enqueued = _dispatcherQueue.TryEnqueue(async () => + { + try + { + tcs.TrySetResult(await _speechToTextService.TranscribeAsync(args, cancellationToken)); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + tcs.TrySetCanceled(cancellationToken); + } + catch (Exception ex) + { + tcs.TrySetException(ex); + } + }); + + if (!enqueued) + tcs.TrySetException(new InvalidOperationException("Speech-to-text dispatcher unavailable")); + + return AwaitWithRegistrationAsync(tcs.Task, registration); + } + + private static async Task AwaitWithRegistrationAsync(Task task, CancellationTokenRegistration registration) + { + try + { + return await task; + } + finally + { + await registration.DisposeAsync(); + } + } + + #endregion + public void Dispose() { StopMcpServer(); @@ -1278,6 +1336,7 @@ public void Dispose() try { _cameraCaptureService?.Dispose(); } catch { /* ignore */ } try { _screenRecordingService?.Dispose(); } catch { /* ignore */ } + 
try { _speechToTextService?.Dispose(); } catch { /* ignore */ } // MediaResolver owns SocketsHttpHandler + HttpClient (disposeHandler:true); // without disposal the connection pool survives node teardown/recreate. try { _mediaResolver?.Dispose(); } catch { /* ignore */ } diff --git a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs index 0afbf9b6..e2914131 100644 --- a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs +++ b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs @@ -64,6 +64,8 @@ public class SettingsManager public bool NodeCameraEnabled { get; set; } = true; public bool NodeLocationEnabled { get; set; } = true; public bool NodeBrowserProxyEnabled { get; set; } = true; + public bool NodeSttEnabled { get; set; } = false; + public string SttLanguage { get; set; } = "en-US"; // Local MCP HTTP server (independent of EnableNodeMode) public bool EnableMcpServer { get; set; } = false; /// @@ -117,6 +119,8 @@ public void Load() NodeCameraEnabled = loaded.NodeCameraEnabled; NodeLocationEnabled = loaded.NodeLocationEnabled; NodeBrowserProxyEnabled = loaded.NodeBrowserProxyEnabled; + NodeSttEnabled = loaded.NodeSttEnabled; + SttLanguage = string.IsNullOrWhiteSpace(loaded.SttLanguage) ? SttLanguage : loaded.SttLanguage; EnableMcpServer = loaded.EnableMcpServer; A2UIImageHosts = loaded.A2UIImageHosts ?? new List(); // Legacy McpOnlyMode migration: @@ -185,6 +189,8 @@ public void Save() NodeCameraEnabled = NodeCameraEnabled, NodeLocationEnabled = NodeLocationEnabled, NodeBrowserProxyEnabled = NodeBrowserProxyEnabled, + NodeSttEnabled = NodeSttEnabled, + SttLanguage = SttLanguage, EnableMcpServer = EnableMcpServer, A2UIImageHosts = A2UIImageHosts.Count == 0 ? null : new List(A2UIImageHosts), // McpOnlyMode is legacy — never written; remains null in serialized output. 
diff --git a/src/OpenClaw.Tray.WinUI/Services/SpeechToText/SpeechToTextService.cs b/src/OpenClaw.Tray.WinUI/Services/SpeechToText/SpeechToTextService.cs new file mode 100644 index 00000000..944fd7ed --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Services/SpeechToText/SpeechToTextService.cs @@ -0,0 +1,411 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using OpenClaw.Shared; +using OpenClaw.Shared.Capabilities; +using Windows.Globalization; +using Windows.Media.SpeechRecognition; +using DesktopDictationGrammar = System.Speech.Recognition.DictationGrammar; +using DesktopRecognizeMode = System.Speech.Recognition.RecognizeMode; +using DesktopSpeechRecognitionEngine = System.Speech.Recognition.SpeechRecognitionEngine; + +namespace OpenClawTray.Services.SpeechToText; + +/// +/// Bounded-window speech-to-text service backed by Windows +/// SpeechRecognizer running in continuous-recognition mode. +/// One call records for up to maxDurationMs, accumulating +/// ResultGenerated phrases, +/// then stops the session and returns the joined transcript. +/// +/// Single-flight: a second concurrent caller fails fast with +/// "STT already in progress" rather than tearing down the active session +/// (per Phase 2 design — no interrupt arg). +/// +/// **Privacy invariant:** transcript text is never passed to IOpenClawLogger. +/// Logger sees outcome + duration only. 
+/// +public sealed class SpeechToTextService : IDisposable +{ + private readonly IOpenClawLogger _logger; + private readonly SettingsManager _settings; + private readonly SemaphoreSlim _gate = new(1, 1); + + public SpeechToTextService(IOpenClawLogger logger, SettingsManager settings) + { + _logger = logger; + _settings = settings; + } + + public async Task TranscribeAsync(SttTranscribeArgs args, CancellationToken cancellationToken = default) + { + // Resolve language: per-call wins, then configured setting, then default. + var languageTag = !string.IsNullOrWhiteSpace(args.Language) + ? args.Language! + : (!string.IsNullOrWhiteSpace(_settings.SttLanguage) ? _settings.SttLanguage : SttCapability.DefaultLanguage); + + // Preflight: bail before opening the mic if the OS can't recognize this language. + ValidateLanguageSupported(languageTag); + + if (!await _gate.WaitAsync(0, cancellationToken).ConfigureAwait(false)) + throw new InvalidOperationException("STT already in progress"); + + SpeechRecognizer? recognizer = null; + try + { + try + { + recognizer = await CreateRecognizerAsync(languageTag); + + var stopwatch = Stopwatch.StartNew(); + var text = await CaptureWindowAsync(recognizer, args.MaxDurationMs, cancellationToken); + stopwatch.Stop(); + + // Log outcome only — never the transcript text. + _logger.Info($"stt.transcribe completed: language={languageTag}, durationMs={stopwatch.ElapsedMilliseconds:0}, transcribed={!string.IsNullOrEmpty(text)}"); + + return new SttTranscribeResult + { + Transcribed = !string.IsNullOrEmpty(text), + Text = text, + DurationMs = (int)Math.Min(stopwatch.ElapsedMilliseconds, int.MaxValue), + Language = languageTag + }; + } + catch (InvalidOperationException) + { + // Already wrapped with a friendly message (mic denied, mic + // unavailable, language pack missing, no speech, "STT already + // in progress", or our own "Speech recognizer unavailable: X"). 
+ throw; + } + catch (Exception ex) when (IsSapiError(ex.HResult)) + { + // The Windows SAPI / WinRT SpeechRecognition stack often + // surfaces a generic "Internal Speech Error" / "The text + // associated with this error code could not be found." for + // ANY failure inside CompileConstraintsAsync, StartAsync, or + // session callbacks. Keep the HRESULT visible because Windows + // often maps these failures to "Internal Speech Error". + _logger.Warn($"[stt] speech stack failure: HRESULT=0x{ex.HResult:X8} type={ex.GetType().Name} message={ex.Message}"); + recognizer?.Dispose(); + recognizer = null; + + try + { + _logger.Info($"[stt] falling back to desktop SAPI recognizer: language={languageTag}"); + var stopwatch = Stopwatch.StartNew(); + var text = await CaptureWindowWithDesktopSapiAsync(languageTag, args.MaxDurationMs, cancellationToken).ConfigureAwait(false); + stopwatch.Stop(); + + _logger.Info($"stt.transcribe completed: engine=desktop-sapi, language={languageTag}, durationMs={stopwatch.ElapsedMilliseconds:0}, transcribed={!string.IsNullOrEmpty(text)}"); + return new SttTranscribeResult + { + Transcribed = !string.IsNullOrEmpty(text), + Text = text, + DurationMs = (int)Math.Min(stopwatch.ElapsedMilliseconds, int.MaxValue), + Language = languageTag + }; + } + catch (Exception fallbackEx) + { + _logger.Warn($"[stt] desktop SAPI fallback failed: type={fallbackEx.GetType().Name} message={fallbackEx.Message}"); + throw new InvalidOperationException( + $"Speech recognition failed (0x{ex.HResult:X8}) and the desktop SAPI fallback also failed: {fallbackEx.Message}. Confirm Online speech recognition, microphone access, and the Speech optional feature are enabled for this language. Underlying WinRT error: {ex.Message}", + fallbackEx); + } + } + catch (Exception ex) + { + // Unexpected — log the HRESULT so we can extend IsSapiError if + // a real SAPI error slips through the filter, then re-throw + // unwrapped so the capability surfaces the original message. 
+ _logger.Warn($"[stt] unmapped failure: HRESULT=0x{ex.HResult:X8} type={ex.GetType().Name} message={ex.Message}"); + throw; + } + } + finally + { + recognizer?.Dispose(); + _gate.Release(); + } + } + + private static void ValidateLanguageSupported(string tag) + { + // Default dictation grammar uses the same topic-language inventory. + // Match by case-insensitive tag before opening the mic. + var supported = SpeechRecognizer.SupportedTopicLanguages; + if (supported == null || supported.Count == 0) + throw new InvalidOperationException("Speech recognition is unavailable on this system."); + + var match = supported.Any(lang => + string.Equals(lang.LanguageTag, tag, StringComparison.OrdinalIgnoreCase)); + if (!match) + throw new InvalidOperationException($"Language pack '{tag}' is not installed for speech recognition."); + } + + private async Task CreateRecognizerAsync(string languageTag) + { + var systemLanguageTag = SpeechRecognizer.SystemSpeechLanguage?.LanguageTag; + var useSystemLanguage = + string.Equals(systemLanguageTag, languageTag, StringComparison.OrdinalIgnoreCase); + + _logger.Info($"[stt] recognizer language: requested={languageTag}, system={systemLanguageTag ?? "(none)"}, mode={(useSystemLanguage ? "system" : "explicit")}"); + + SpeechRecognizer recognizer; + try + { + // If the caller requested the active Windows speech language, use + // the system-language constructor. This follows the same path as + // built-in dictation more closely than forcing an equivalent tag. + recognizer = useSystemLanguage + ? new SpeechRecognizer() + : new SpeechRecognizer(new Language(languageTag)); + } + catch (ArgumentException ex) + { + // Defense in depth — preflight should have caught this. + throw new InvalidOperationException($"Language pack '{languageTag}' is not installed for speech recognition.", ex); + } + + try + { + // Do not add an explicit SpeechRecognitionTopicConstraint here. 
+ // CompileConstraintsAsync with an empty Constraints collection uses + // Windows' default dictation grammar, which avoids 0x800455A0 + // failures seen on some systems with the explicit Dictation topic. + _logger.Info($"[stt] compiling recognizer constraints: mode={(useSystemLanguage ? "system" : "explicit")}"); + var compilation = await recognizer.CompileConstraintsAsync(); + _logger.Info($"[stt] recognizer constraints compiled: status={compilation.Status}, mode={(useSystemLanguage ? "system" : "explicit")}"); + if (compilation.Status != SpeechRecognitionResultStatus.Success) + { + throw new InvalidOperationException($"Speech recognizer unavailable: {compilation.Status}"); + } + + return recognizer; + } + catch + { + recognizer.Dispose(); + throw; + } + } + + private async Task CaptureWindowAsync( + SpeechRecognizer recognizer, + int maxDurationMs, + CancellationToken cancellationToken) + { + // Buffer phrase results as they arrive; concatenate at the end. + var phrases = new List(); + var phraseLock = new object(); + + void OnResult(SpeechContinuousRecognitionSession session, SpeechContinuousRecognitionResultGeneratedEventArgs e) + { + // Drop low-confidence noise — Rejected confidence is what + // SpeechRecognizer returns for babble / background sound. + if (e.Result.Confidence == SpeechRecognitionConfidence.Rejected) + return; + var phrase = e.Result.Text; + if (string.IsNullOrWhiteSpace(phrase)) + return; + lock (phraseLock) + { + phrases.Add(phrase); + } + } + + recognizer.ContinuousRecognitionSession.ResultGenerated += OnResult; + + try + { + using var durationCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + durationCts.CancelAfter(maxDurationMs); + + try + { + await recognizer.ContinuousRecognitionSession.StartAsync().AsTask(cancellationToken); + } + catch (UnauthorizedAccessException ex) + { + throw new InvalidOperationException("Microphone permission denied. 
Enable microphone access for desktop apps in Windows Settings → Privacy & security → Microphone (packaged MSIX installs additionally need per-app permission for OpenClaw Tray).", ex); + } + catch (Exception ex) when ((uint)ex.HResult == 0x80045509) + { + // SPERR_AUDIO_NOT_FOUND — no audio input device available. + throw new InvalidOperationException("Microphone unavailable.", ex); + } + // Other SAPI/speech errors are caught at the outer scope + // (TranscribeAsync) so failures from CompileConstraintsAsync or + // session callbacks get the same friendly mapping. + + try + { + await Task.Delay(Timeout.Infinite, durationCts.Token); + } + catch (OperationCanceledException) + { + // Either the caller cancelled or the duration cap fired. + // Both paths fall through to StopAsync below — we MUST + // release the mic before deciding whether to surface the + // cancel as an error. + } + + // Always try to stop cleanly. StopAsync awaits any in-flight + // ResultGenerated dispatch so we don't drop the last phrase. + try + { + await recognizer.ContinuousRecognitionSession.StopAsync().AsTask(CancellationToken.None); + } + catch + { + // StopAsync can throw if the session was already cancelled by the + // OS; we still want to release the mic and proceed. + } + + // Caller cancellation wins over a partial transcript — the API + // contract returns "Transcribe canceled" rather than partial text. + // Duration-cap cancellation falls through and we return what we got. + cancellationToken.ThrowIfCancellationRequested(); + } + finally + { + recognizer.ContinuousRecognitionSession.ResultGenerated -= OnResult; + } + + lock (phraseLock) + { + if (phrases.Count == 0) + { + // Plan contract: no-speech / timeout is an error, not a + // success with empty text. Caller distinguishes this from + // a transient failure by the message. 
+ throw new InvalidOperationException("No speech detected within the bounded capture window."); + } + var sb = new StringBuilder(phrases.Sum(p => p.Length + 1)); + for (int i = 0; i < phrases.Count; i++) + { + if (i > 0) sb.Append(' '); + sb.Append(phrases[i]); + } + return sb.ToString(); + } + } + + private static async Task CaptureWindowWithDesktopSapiAsync( + string languageTag, + int maxDurationMs, + CancellationToken cancellationToken) + { + return await Task.Run(() => CaptureWindowWithDesktopSapi(languageTag, maxDurationMs, cancellationToken), cancellationToken).ConfigureAwait(false); + } + + private static string CaptureWindowWithDesktopSapi( + string languageTag, + int maxDurationMs, + CancellationToken cancellationToken) + { + var culture = CultureInfo.GetCultureInfo(languageTag); + var recognizerInfo = DesktopSpeechRecognitionEngine.InstalledRecognizers() + .FirstOrDefault(info => string.Equals(info.Culture.Name, culture.Name, StringComparison.OrdinalIgnoreCase)); + + if (recognizerInfo == null) + throw new InvalidOperationException($"Desktop speech recognizer language pack '{languageTag}' is not installed."); + + using var engine = new DesktopSpeechRecognitionEngine(recognizerInfo); + var phrases = new List(); + var phraseLock = new object(); + Exception? 
recognitionError = null; + + using var recognitionEnded = new ManualResetEventSlim(false); + using var cancellationRegistration = cancellationToken.Register(() => recognitionEnded.Set()); + + engine.SpeechRecognized += (_, e) => + { + if (e.Result.Confidence <= 0.0f || string.IsNullOrWhiteSpace(e.Result.Text)) + return; + + lock (phraseLock) + { + phrases.Add(e.Result.Text); + } + }; + engine.RecognizeCompleted += (_, e) => + { + if (e.Error != null) + recognitionError = e.Error; + recognitionEnded.Set(); + }; + + try + { + engine.LoadGrammar(new DesktopDictationGrammar()); + engine.SetInputToDefaultAudioDevice(); + engine.RecognizeAsync(DesktopRecognizeMode.Multiple); + + recognitionEnded.Wait(maxDurationMs); + } + finally + { + try + { + engine.RecognizeAsyncStop(); + } + catch (InvalidOperationException) + { + engine.RecognizeAsyncCancel(); + } + } + + cancellationToken.ThrowIfCancellationRequested(); + + if (recognitionError != null) + throw new InvalidOperationException($"Desktop speech recognition failed: {recognitionError.Message}", recognitionError); + + lock (phraseLock) + { + if (phrases.Count == 0) + throw new InvalidOperationException("No speech detected within the bounded capture window."); + + var sb = new StringBuilder(phrases.Sum(p => p.Length + 1)); + for (int i = 0; i < phrases.Count; i++) + { + if (i > 0) sb.Append(' '); + sb.Append(phrases[i]); + } + return sb.ToString(); + } + } + + public void Dispose() + { + _gate.Dispose(); + } + + /// + /// True for HRESULTs in the SAPI/Windows speech facility ranges. Windows + /// surfaces these as "Internal Speech Error" / "The text associated with + /// this error code could not be found" because the strings are not in the + /// system message table — the most common real cause is online speech + /// recognition being disabled in Privacy settings. 
+ /// + private static bool IsSapiError(int hresult) + { + var u = (uint)hresult; + // Windows speech HRESULTs use facility 0x004 (FACILITY_ITF for SAPI + // and the WinRT speech subsystem). The full range 0x8004XXXX is + // shared with other COM/ITF errors, but the "no friendly text" + // / "Internal Speech Error" surface is specific to this range and + // a broader catch is safer than missing real failures. Note: we + // already handle UnauthorizedAccessException + 0x80045509 + // ("Microphone unavailable") above, so this catch sees only the + // residual speech-stack errors. + return (u & 0xFFFF0000u) == 0x80040000u; + } +} diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml index e5dbee62..154f0d2e 100644 --- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml +++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml @@ -203,6 +203,33 @@ + + + + + + + + + + + + + + + + + diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs index 89f0b91b..5c3079de 100644 --- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs @@ -94,6 +94,8 @@ private void LoadSettings() NodeCameraToggle.IsOn = _settings.NodeCameraEnabled; NodeLocationToggle.IsOn = _settings.NodeLocationEnabled; NodeBrowserProxyToggle.IsOn = _settings.NodeBrowserProxyEnabled; + NodeSttToggle.IsOn = _settings.NodeSttEnabled; + SttLanguageTextBox.Text = _settings.SttLanguage; UpdateSshTunnelPreviewText(); McpServerToggle.IsOn = _settings.EnableMcpServer; McpUrlTextBox.Text = NodeService.McpServerUrl; @@ -386,6 +388,10 @@ private void SaveSettings() _settings.NodeCameraEnabled = NodeCameraToggle.IsOn; _settings.NodeLocationEnabled = NodeLocationToggle.IsOn; _settings.NodeBrowserProxyEnabled = NodeBrowserProxyToggle.IsOn; + _settings.NodeSttEnabled = NodeSttToggle.IsOn; + var sttLanguage = 
SttLanguageTextBox.Text?.Trim(); + if (!string.IsNullOrEmpty(sttLanguage)) + _settings.SttLanguage = sttLanguage; _settings.EnableMcpServer = McpServerToggle.IsOn; _settings.Save(); diff --git a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs index 57dacb42..5ef716be 100644 --- a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs +++ b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs @@ -2257,3 +2257,230 @@ public async Task ExecuteAsync_ReturnsError_ForUnknownCommand() Assert.Contains("Unknown command", res.Error); } } + +public class SttCapabilityTests +{ + private static JsonElement Parse(string json) + { + using var doc = JsonDocument.Parse(json); + return doc.RootElement.Clone(); + } + + [Fact] + public void CanHandle_SttTranscribe() + { + var cap = new SttCapability(NullLogger.Instance); + Assert.True(cap.CanHandle("stt.transcribe")); + Assert.False(cap.CanHandle("stt.stream")); + Assert.False(cap.CanHandle("tts.speak")); + Assert.Equal("stt", cap.Category); + Assert.Equal(SttCapability.TranscribeCommand, cap.Commands.Single()); + } + + [Fact] + public void ResolveLanguage_PrefersRequested() + { + Assert.Equal("ja-JP", SttCapability.ResolveLanguage("ja-JP", "en-GB")); + Assert.Equal("en-GB", SttCapability.ResolveLanguage(null, "en-GB")); + Assert.Equal("en-GB", SttCapability.ResolveLanguage(" ", "en-GB")); + Assert.Equal(SttCapability.DefaultLanguage, SttCapability.ResolveLanguage(null, null)); + } + + [Fact] + public void ResolveLanguage_RejectsNonsense() + { + Assert.Null(SttCapability.ResolveLanguage("not a tag", null)); + Assert.Null(SttCapability.ResolveLanguage("english", null)); + Assert.Null(SttCapability.ResolveLanguage("en_US", null)); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenMaxDurationMissing() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += (_, _) => throw new InvalidOperationException("should not be called"); + + var res = await 
cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt1", + Command = "stt.transcribe", + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Missing required maxDurationMs", res.Error); + } + + [Theory] + [InlineData(0)] + [InlineData(-1)] + [InlineData(-5000)] + public async Task Transcribe_ReturnsError_WhenMaxDurationNotPositive(int maxMs) + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt2", + Command = "stt.transcribe", + Args = Parse($$"""{"maxDurationMs":{{maxMs}}}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Missing required maxDurationMs", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenMaxDurationExceedsBound() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt3", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":60000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("exceeds 30000", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenLanguageInvalid() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt4", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000,"language":"english please"}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Invalid language tag", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenHandlerNotWired() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt5", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error); + } + + [Fact] + public async Task Transcribe_PassesArgsToHandler_AndReturnsPayload() + { + var cap = new SttCapability(NullLogger.Instance); + SttTranscribeArgs? 
received = null; + cap.TranscribeRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttTranscribeResult + { + Transcribed = true, + Text = "hello", + DurationMs = 4200, + Language = a.Language ?? SttCapability.DefaultLanguage + }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt6", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000,"language":"en-GB"}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal(5000, received!.MaxDurationMs); + Assert.Equal("en-GB", received.Language); + + var payload = JsonSerializer.SerializeToElement(res.Payload); + Assert.True(payload.GetProperty("transcribed").GetBoolean()); + Assert.Equal("hello", payload.GetProperty("text").GetString()); + Assert.Equal(4200, payload.GetProperty("durationMs").GetInt32()); + Assert.Equal("en-GB", payload.GetProperty("language").GetString()); + } + + [Fact] + public async Task Transcribe_DropsLanguage_WhenOmitted_LettingTrayUseSetting() + { + var cap = new SttCapability(NullLogger.Instance); + SttTranscribeArgs? 
received = null; + cap.TranscribeRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttTranscribeResult { Transcribed = true, Text = "hi", DurationMs = 100, Language = "en-US" }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt7", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":1000}""") + }); + + Assert.True(res.Ok); + Assert.Null(received!.Language); + } + + [Fact] + public async Task Transcribe_ReportsHandlerException() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += (_, _) => throw new InvalidOperationException("Microphone unavailable."); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt8", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":2000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Microphone unavailable", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsCanceled_WhenTokenFires() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += async (_, ct) => + { + await Task.Delay(Timeout.Infinite, ct); + return new SttTranscribeResult(); + }; + + using var cts = new CancellationTokenSource(); + cts.CancelAfter(TimeSpan.FromMilliseconds(50)); + + var res = await cap.ExecuteAsync( + new NodeInvokeRequest { Id = "stt9", Command = "stt.transcribe", Args = Parse("""{"maxDurationMs":5000}""") }, + cts.Token); + + Assert.False(res.Ok); + Assert.Contains("canceled", res.Error, StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task ExecuteAsync_ReturnsError_ForUnknownCommand() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt10", + Command = "stt.stream", + Args = Parse("""{}""") + }); + Assert.False(res.Ok); + Assert.Contains("Unknown command", res.Error); + } +} diff --git a/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs 
b/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs index 198141c6..a826a65c 100644 --- a/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs +++ b/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs @@ -349,4 +349,28 @@ public async Task UnhandledException_ReturnsGenericInternalError_NotLeakingMessa Assert.Equal(-32603, error.GetProperty("code").GetInt32()); Assert.DoesNotContain("secret-internal-detail", error.GetProperty("message").GetString()); } + + [Fact] + public async Task ToolsList_SttTranscribe_HasCuratedDescription() + { + var caps = new List + { + new FakeCapability("stt", "stt.transcribe"), + }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var description = doc.RootElement.GetProperty("result") + .GetProperty("tools")[0] + .GetProperty("description") + .GetString()!; + + // Must mention the key surface area so MCP clients render something useful. + Assert.Contains("microphone", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("maxDurationMs", description); + Assert.Contains("text", description, System.StringComparison.OrdinalIgnoreCase); + // And explicitly NOT the generic stub. 
+ Assert.DoesNotContain("stt capability:", description); + } } diff --git a/tests/OpenClaw.Shared.Tests/ModelsTests.cs b/tests/OpenClaw.Shared.Tests/ModelsTests.cs index bef5095f..5e0e6bfc 100644 --- a/tests/OpenClaw.Shared.Tests/ModelsTests.cs +++ b/tests/OpenClaw.Shared.Tests/ModelsTests.cs @@ -1701,4 +1701,29 @@ public void ContextSummaryShort_FormatsSmallNumbers() var session = new SessionInfo { TotalTokens = 500, ContextTokens = 1000 }; Assert.Contains("500/1.0K", session.ContextSummaryShort); } + + [Fact] + public void DangerousCommands_IncludesSttTranscribe() + { + Assert.Contains("stt.transcribe", CommandCenterCommandGroups.DangerousCommands); + Assert.Contains("stt.transcribe", (IReadOnlySet)CommandCenterCommandGroups.DangerousCommandSet); + } + + [Fact] + public void MacNodeParityCommands_ExcludesSttTranscribe() + { + // Mac has no equivalent yet; ensure parity diagnostic does not flag + // Windows nodes for "missing" stt.transcribe. + Assert.DoesNotContain("stt.transcribe", CommandCenterCommandGroups.MacNodeParityCommands); + } + + [Fact] + public void CommonDangerousCommands_StillIncludedInMacParity() + { + // Refactor invariant: the original camera/screen dangerous commands + // still appear in Mac parity via the shared CommonDangerousCommands set. 
+ Assert.Contains("camera.snap", CommandCenterCommandGroups.MacNodeParityCommands); + Assert.Contains("camera.clip", CommandCenterCommandGroups.MacNodeParityCommands); + Assert.Contains("screen.record", CommandCenterCommandGroups.MacNodeParityCommands); + } } diff --git a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs index 231fba46..deb48875 100644 --- a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +++ b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs @@ -36,6 +36,8 @@ public void RoundTrip_AllFields_Preserved() NodeCameraEnabled = false, NodeLocationEnabled = true, NodeBrowserProxyEnabled = false, + NodeSttEnabled = true, + SttLanguage = "en-GB", HasSeenActivityStreamTip = true, SkippedUpdateTag = "v1.2.3", NotifyChatResponses = false, @@ -76,6 +78,8 @@ public void RoundTrip_AllFields_Preserved() Assert.Equal(original.NodeCameraEnabled, restored.NodeCameraEnabled); Assert.Equal(original.NodeLocationEnabled, restored.NodeLocationEnabled); Assert.Equal(original.NodeBrowserProxyEnabled, restored.NodeBrowserProxyEnabled); + Assert.Equal(original.NodeSttEnabled, restored.NodeSttEnabled); + Assert.Equal(original.SttLanguage, restored.SttLanguage); Assert.Equal(original.HasSeenActivityStreamTip, restored.HasSeenActivityStreamTip); Assert.Equal(original.SkippedUpdateTag, restored.SkippedUpdateTag); Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses); @@ -133,6 +137,8 @@ public void MissingFields_UseDefaults() Assert.True(settings.NodeCameraEnabled); Assert.True(settings.NodeLocationEnabled); Assert.True(settings.NodeBrowserProxyEnabled); + Assert.False(settings.NodeSttEnabled); + Assert.Equal("en-US", settings.SttLanguage); Assert.False(settings.HasSeenActivityStreamTip); Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.NotifyChatResponses); @@ -182,6 +188,8 @@ public void BackwardCompatibility_OldSettingsWithoutNewFields() Assert.True(settings.NodeCameraEnabled); 
Assert.True(settings.NodeLocationEnabled); Assert.True(settings.NodeBrowserProxyEnabled); + Assert.False(settings.NodeSttEnabled); + Assert.Equal("en-US", settings.SttLanguage); Assert.False(settings.HasSeenActivityStreamTip); Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.GlobalHotkeyEnabled); From 54b01e5aeb39b40ebee83bfcbc2dbc04193e1af4 Mon Sep 17 00:00:00 2001 From: Regis Brid Date: Mon, 4 May 2026 14:53:53 -0700 Subject: [PATCH 02/34] stt: privacy hardening, localization, and test coverage Review-driven cleanup on top of the initial stt.transcribe capability. No behavior change for successful invocations. Privacy: * SttCapability no longer echoes the caller-supplied language tag in the "Invalid language tag" error, and no longer interpolates the underlying exception's Message into "Transcribe failed". Both could end up in the recent-activity stream and BuildSupportBundle output, which can be shared off-device. Full detail still goes to the local logger. * App.OnNodeInvokeCompleted now sanitizes failed-invoke details for privacy-sensitive commands (stt.transcribe, camera.snap/clip, screen.snapshot/record). Recent activity and support bundles record only "privacy-sensitive · N ms · error" instead of the raw error string. Non-privacy-sensitive commands keep the error text since it is useful for diagnostics and does not carry mic/camera args. * Models.cs PermissionDiagnostics microphone detail now mentions stt.transcribe instead of "future voice features", so users hitting 0x800455A0 see microphone in their permissions checklist as relevant. Refactors for testability (no behavior change): * New Services/NodeInvokeActivityFormatter.cs owns GetPrivacyClass and BuildDetails. App.OnNodeInvokeCompleted delegates to it. * New Services/NodeCapabilityGating.cs owns the optional-capability predicates. NodeService.RegisterCapabilities calls into it instead of inlining "_settings?.NodeXxxEnabled" checks.
Privacy-sensitive defaults stay off; everything else stays default-on. * Both helpers are linked into OpenClaw.Tray.Tests. Localization: * SettingsWindow.xaml gains x:Uid for every TTS and STT control. The literal Text/Header/PlaceholderText values are kept as dev-time fallbacks, matching the SettingsTokenTextBox and SettingsMcpDescription pattern already in the file. * en-us, fr-fr, nl-nl, zh-cn, and zh-tw .resw files gain matching entries for the 14 new TTS/STT keys. Brand names (ElevenLabs), command and technical names (tts.speak, stt.transcribe, gateway.nodes.allowCommands, MSIX), BCP-47 tags, and the eleven_multilingual_v2 model identifier are kept verbatim across all locales. * SettingsMcpDescription.Text in all five locales now lists "microphone" and "speakers" alongside camera/screen/canvas so the local MCP-server description reflects the full TTS + STT voice surface. Tests: * Two new privacy regression tests in CapabilityTests verify that an invalid language and a thrown handler exception never leak their text into the response error. * New NodeInvokeActivityFormatterTests pin the privacy-class table, the sanitized details for privacy-sensitive failures, and the full ActivityStreamService.BuildSupportBundle path. * New NodeCapabilityGatingTests pin that tts.speak and stt.transcribe default off (including for null settings) and that the two capabilities are independent consent surfaces. * New SettingsWindowLocalizationCoverageTests parses SettingsWindow.xaml and asserts every new TTS/STT x:Uid resolves to the expected .Header/.Text/.Content/.PlaceholderText keys in en-us. * ActivityStreamServiceTests and NodeInvokeActivityFormatterTests now share a non-parallel xUnit collection because ActivityStreamService is a static singleton; running both classes in parallel could otherwise cause flaky support-bundle assertions. * NodeCapabilityGatingTests cleans up its temp settings directories.
Cleanup: * Drop "Phase 2" wording from SpeechToTextService.cs; the resw section comments referring to "Phase 1 TTS / Phase 2 STT" are likewise reworded to plain "TTS / STT settings". Phase numbering is a planning artifact and should not appear in the codebase. Validation: * .\build.ps1 * dotnet test tests/OpenClaw.Shared.Tests/OpenClaw.Shared.Tests.csproj --no-restore (1173 passed, 20 skipped) * dotnet test tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj --no-restore (465 passed) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Capabilities/SttCapability.cs | 9 +- src/OpenClaw.Shared/Models.cs | 2 +- src/OpenClaw.Tray.WinUI/App.xaml.cs | 24 +--- .../Services/NodeCapabilityGating.cs | 26 ++++ .../Services/NodeInvokeActivityFormatter.cs | 62 ++++++++ .../Services/NodeService.cs | 14 +- .../SpeechToText/SpeechToTextService.cs | 2 +- .../Strings/en-us/Resources.resw | 49 ++++++- .../Strings/fr-fr/Resources.resw | 49 ++++++- .../Strings/nl-nl/Resources.resw | 49 ++++++- .../Strings/zh-cn/Resources.resw | 49 ++++++- .../Strings/zh-tw/Resources.resw | 49 ++++++- .../Windows/SettingsWindow.xaml | 29 ++-- .../OpenClaw.Shared.Tests/CapabilityTests.cs | 45 +++++- .../ActivityStreamServiceTests.cs | 7 + .../NodeCapabilityGatingTests.cs | 132 ++++++++++++++++++ .../NodeInvokeActivityFormatterTests.cs | 120 ++++++++++++++++ .../OpenClaw.Tray.Tests.csproj | 2 + ...SettingsWindowLocalizationCoverageTests.cs | 113 +++++++++++++++ 19 files changed, 782 insertions(+), 50 deletions(-) create mode 100644 src/OpenClaw.Tray.WinUI/Services/NodeCapabilityGating.cs create mode 100644 src/OpenClaw.Tray.WinUI/Services/NodeInvokeActivityFormatter.cs create mode 100644 tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs create mode 100644 tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs create mode 100644 tests/OpenClaw.Tray.Tests/SettingsWindowLocalizationCoverageTests.cs diff --git a/src/OpenClaw.Shared/Capabilities/SttCapability.cs 
b/src/OpenClaw.Shared/Capabilities/SttCapability.cs index f401b606..43eea6ac 100644 --- a/src/OpenClaw.Shared/Capabilities/SttCapability.cs +++ b/src/OpenClaw.Shared/Capabilities/SttCapability.cs @@ -96,7 +96,7 @@ public override async Task ExecuteAsync( { resolvedLanguage = NormalizeLanguageTag(requestedLanguage); if (resolvedLanguage == null) - return Error($"Invalid language tag '{requestedLanguage}'"); + return Error("Invalid language tag"); } if (TranscribeRequested == null) @@ -127,9 +127,12 @@ public override async Task ExecuteAsync( } catch (Exception ex) { - // Exception message only — capability never sees the transcript. + // Privacy: never echo raw exception text into the response. The + // exception flows through the failed-invoke path and may be + // persisted to recent activity / support bundles. Full detail + // stays in the local log only. Logger.Error("STT transcribe failed", ex); - return Error($"Transcribe failed: {ex.Message}"); + return Error("Transcribe failed"); } } } diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs index dab4bfdd..da7ef89e 100644 --- a/src/OpenClaw.Shared/Models.cs +++ b/src/OpenClaw.Shared/Models.cs @@ -762,7 +762,7 @@ public static List BuildDefaultWindowsMatrix() { Name = "Microphone", Status = "review", - Detail = "Required only for camera clips with audio or future voice features.", + Detail = "Required for camera clips with audio and for stt.transcribe speech-to-text capture.", SettingsUri = "ms-settings:privacy-microphone" }, new() diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs index 063d2fa9..90922216 100644 --- a/src/OpenClaw.Tray.WinUI/App.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs @@ -1199,10 +1199,8 @@ private void OnNodeNotificationRequested(object? sender, OpenClaw.Shared.Capabil private void OnNodeInvokeCompleted(object? sender, NodeInvokeCompletedEventArgs args) { var status = args.Ok ? 
"completed" : "failed"; - var durationMs = Math.Max(0, (int)Math.Round(args.Duration.TotalMilliseconds)); - var details = args.Ok - ? $"{GetNodeInvokePrivacyClass(args.Command)} · {durationMs} ms" - : $"{GetNodeInvokePrivacyClass(args.Command)} · {durationMs} ms · {args.Error ?? "unknown error"}"; + var durationMs = (int)Math.Round(args.Duration.TotalMilliseconds); + var details = NodeInvokeActivityFormatter.BuildDetails(args.Command, args.Ok, durationMs, args.Error); AddRecentActivity( $"node.invoke {status}: {args.Command}", @@ -1215,23 +1213,7 @@ private void OnNodeInvokeCompleted(object? sender, NodeInvokeCompletedEventArgs } private static string GetNodeInvokePrivacyClass(string command) - { - if (string.Equals(command, "screen.record", StringComparison.OrdinalIgnoreCase) || - string.Equals(command, "screen.snapshot", StringComparison.OrdinalIgnoreCase) || - string.Equals(command, "camera.snap", StringComparison.OrdinalIgnoreCase) || - string.Equals(command, "camera.clip", StringComparison.OrdinalIgnoreCase) || - string.Equals(command, "stt.transcribe", StringComparison.OrdinalIgnoreCase)) - { - return "privacy-sensitive"; - } - - if (command.StartsWith("system.run", StringComparison.OrdinalIgnoreCase)) - { - return "exec"; - } - - return "metadata"; - } + => NodeInvokeActivityFormatter.GetPrivacyClass(command); private void OnConnectionStatusChanged(object? sender, ConnectionStatus status) { diff --git a/src/OpenClaw.Tray.WinUI/Services/NodeCapabilityGating.cs b/src/OpenClaw.Tray.WinUI/Services/NodeCapabilityGating.cs new file mode 100644 index 00000000..343ae7af --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Services/NodeCapabilityGating.cs @@ -0,0 +1,26 @@ +namespace OpenClawTray.Services; + +/// +/// Pure predicates that decide which optional node capabilities should be +/// advertised based on the user's flags. +/// +/// Extracted from NodeService.RegisterCapabilities so the gating +/// rules can be unit-tested without standing up the full tray host. 
Both +/// the gateway client path and the MCP-only path read from the same +/// authoritative capability list, so a regression here would silently drop +/// or leak a capability across both surfaces. +/// +/// Defaults: capabilities default ON (a missing or null settings object +/// counts as enabled) except tts.speak and stt.transcribe, +/// which are privacy-sensitive and require an explicit opt-in. +/// +internal static class NodeCapabilityGating +{ + public static bool ShouldRegisterCanvas(SettingsManager? s) => s?.NodeCanvasEnabled != false; + public static bool ShouldRegisterScreen(SettingsManager? s) => s?.NodeScreenEnabled != false; + public static bool ShouldRegisterCamera(SettingsManager? s) => s?.NodeCameraEnabled != false; + public static bool ShouldRegisterLocation(SettingsManager? s) => s?.NodeLocationEnabled != false; + public static bool ShouldRegisterBrowserProxy(SettingsManager? s) => s?.NodeBrowserProxyEnabled != false; + public static bool ShouldRegisterTts(SettingsManager? s) => s?.NodeTtsEnabled == true; + public static bool ShouldRegisterStt(SettingsManager? s) => s?.NodeSttEnabled == true; +} diff --git a/src/OpenClaw.Tray.WinUI/Services/NodeInvokeActivityFormatter.cs b/src/OpenClaw.Tray.WinUI/Services/NodeInvokeActivityFormatter.cs new file mode 100644 index 00000000..7e13a5ed --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Services/NodeInvokeActivityFormatter.cs @@ -0,0 +1,62 @@ +using System; + +namespace OpenClawTray.Services; + +/// +/// Builds the activity-stream "details" string for completed node invocations. +/// +/// Extracted from App.OnNodeInvokeCompleted so the formatter can be unit-tested +/// without spinning up the WinUI App. Drives both the recent-activity menu and +/// . +/// +/// **Privacy invariant:** for privacy-sensitive commands (mic / camera / +/// screen) a failed invocation never includes the underlying error text in +/// details, since support bundles can be shared off-device. 
Caller-supplied +/// args (e.g., language tag) and runtime details (audio/video stack errors) +/// stay in the local log only. +/// +internal static class NodeInvokeActivityFormatter +{ + public const string PrivacySensitive = "privacy-sensitive"; + public const string Exec = "exec"; + public const string Metadata = "metadata"; + + public static string GetPrivacyClass(string command) + { + if (string.IsNullOrEmpty(command)) return Metadata; + + if (string.Equals(command, "screen.record", StringComparison.OrdinalIgnoreCase) || + string.Equals(command, "screen.snapshot", StringComparison.OrdinalIgnoreCase) || + string.Equals(command, "camera.snap", StringComparison.OrdinalIgnoreCase) || + string.Equals(command, "camera.clip", StringComparison.OrdinalIgnoreCase) || + string.Equals(command, "stt.transcribe", StringComparison.OrdinalIgnoreCase)) + { + return PrivacySensitive; + } + + if (command.StartsWith("system.run", StringComparison.OrdinalIgnoreCase)) + { + return Exec; + } + + return Metadata; + } + + public static string BuildDetails(string command, bool ok, int durationMs, string? error) + { + var privacyClass = GetPrivacyClass(command); + durationMs = Math.Max(0, durationMs); + + if (ok) + return $"{privacyClass} · {durationMs} ms"; + + if (string.Equals(privacyClass, PrivacySensitive, StringComparison.Ordinal)) + { + // See class summary: never echo error text for privacy-sensitive + // commands. Full detail stays in the local log. + return $"{privacyClass} · {durationMs} ms · error"; + } + + return $"{privacyClass} · {durationMs} ms · {error ?? 
"unknown error"}"; + } +} diff --git a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs index 2b7e40c0..e062a7b2 100644 --- a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs +++ b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs @@ -248,7 +248,7 @@ private void RegisterCapabilities() _systemCapability.SetPromptHandler(new ExecApprovalPromptService(_dispatcherQueue, _rootProvider, _logger)); Register(_systemCapability); - if (_settings?.NodeCanvasEnabled != false) + if (NodeCapabilityGating.ShouldRegisterCanvas(_settings)) { _canvasCapability = new CanvasCapability(_logger); _canvasCapability.PresentRequested += OnCanvasPresent; @@ -263,7 +263,7 @@ private void RegisterCapabilities() Register(_canvasCapability); } - if (_settings?.NodeScreenEnabled != false) + if (NodeCapabilityGating.ShouldRegisterScreen(_settings)) { _screenCapability = new ScreenCapability(_logger); _screenCapability.CaptureRequested += OnScreenCapture; @@ -271,7 +271,7 @@ private void RegisterCapabilities() Register(_screenCapability); } - if (_settings?.NodeCameraEnabled != false) + if (NodeCapabilityGating.ShouldRegisterCamera(_settings)) { _cameraCapability = new CameraCapability(_logger); _cameraCapability.ListRequested += OnCameraList; @@ -280,14 +280,14 @@ private void RegisterCapabilities() Register(_cameraCapability); } - if (_settings?.NodeLocationEnabled != false) + if (NodeCapabilityGating.ShouldRegisterLocation(_settings)) { _locationCapability = new LocationCapability(_logger); _locationCapability.GetRequested += async (args) => await GetLocationAsync(args); Register(_locationCapability); } - if (_settings?.NodeTtsEnabled == true) + if (NodeCapabilityGating.ShouldRegisterTts(_settings)) { _textToSpeechService ??= new TextToSpeechService(_logger, _settings); _ttsCapability = new TtsCapability(_logger); @@ -302,7 +302,7 @@ private void RegisterCapabilities() _deviceCapability = new DeviceCapability(_logger, _deviceStatusProvider); 
Register(_deviceCapability); - if (_settings?.NodeSttEnabled == true) + if (NodeCapabilityGating.ShouldRegisterStt(_settings)) { _speechToTextService ??= new SpeechToText.SpeechToTextService(_logger, _settings); _sttCapability = new SttCapability(_logger); @@ -311,7 +311,7 @@ private void RegisterCapabilities() } // BrowserProxy needs a live gateway connection — only register when gateway is up. - if (_nodeClient != null && _settings?.NodeBrowserProxyEnabled != false) + if (_nodeClient != null && NodeCapabilityGating.ShouldRegisterBrowserProxy(_settings)) { _browserProxyCapability = new BrowserProxyCapability( _logger, diff --git a/src/OpenClaw.Tray.WinUI/Services/SpeechToText/SpeechToTextService.cs b/src/OpenClaw.Tray.WinUI/Services/SpeechToText/SpeechToTextService.cs index 944fd7ed..560bf74f 100644 --- a/src/OpenClaw.Tray.WinUI/Services/SpeechToText/SpeechToTextService.cs +++ b/src/OpenClaw.Tray.WinUI/Services/SpeechToText/SpeechToTextService.cs @@ -25,7 +25,7 @@ namespace OpenClawTray.Services.SpeechToText; /// /// Single-flight: a second concurrent caller fails fast with /// "STT already in progress" rather than tearing down the active session -/// (per Phase 2 design — no interrupt arg). +/// (the capability deliberately exposes no interrupt arg). /// /// **Privacy invariant:** transcript text is never passed to . /// Logger sees outcome + duration only. diff --git a/src/OpenClaw.Tray.WinUI/Strings/en-us/Resources.resw b/src/OpenClaw.Tray.WinUI/Strings/en-us/Resources.resw index 59accc1b..88053ada 100644 --- a/src/OpenClaw.Tray.WinUI/Strings/en-us/Resources.resw +++ b/src/OpenClaw.Tray.WinUI/Strings/en-us/Resources.resw @@ -960,7 +960,7 @@ On your gateway host (Mac/Linux), run: Enable Local MCP Server - Exposes the same node capabilities (system, screen, camera, canvas) to local MCP clients (Claude Desktop, Cursor, Claude Code). 
+ Exposes the same node capabilities (system, screen, camera, microphone, speakers, canvas) to local MCP clients (Claude Desktop, Cursor, Claude Code). Endpoint: @@ -1414,4 +1414,51 @@ On your gateway host (Mac/Linux), run: You can configure the gateway later in Settings. + + + Text-to-speech playback + + + Text-to-speech exposes tts.speak. Gateway use still requires tts.speak in gateway.nodes.allowCommands. + + + TTS provider + + + Windows built-in speech + + + ElevenLabs + + + ElevenLabs API key + + + ElevenLabs voice ID + + + ElevenLabs model + + + eleven_multilingual_v2 + + + Speech-to-text (microphone) + + + When enabled, this Windows node advertises stt.transcribe to capture short audio clips from the default microphone and return transcribed text. Default off. Exposes the microphone to both gateway callers (when Node Mode is on, subject to the gateway allowlist) and local MCP clients (when the local MCP server is on, subject to the bearer token). Requires Windows microphone access for desktop apps in Settings → Privacy & security → Microphone (packaged MSIX installs additionally need per-app permission). + + + Enable speech-to-text + + + Language tag + + + en-US + + + BCP-47 tag (e.g. en-US, en-GB, ja-JP). The OS must have the speech recognition pack for this language installed. + + diff --git a/src/OpenClaw.Tray.WinUI/Strings/fr-fr/Resources.resw b/src/OpenClaw.Tray.WinUI/Strings/fr-fr/Resources.resw index c592a252..e47a3371 100644 --- a/src/OpenClaw.Tray.WinUI/Strings/fr-fr/Resources.resw +++ b/src/OpenClaw.Tray.WinUI/Strings/fr-fr/Resources.resw @@ -960,7 +960,7 @@ Sur votre hôte passerelle (Mac/Linux), exécutez : Activer le serveur MCP local - Expose les mêmes capacités du nœud (système, écran, caméra, canvas) aux clients MCP locaux (Claude Desktop, Cursor, Claude Code). + Expose les mêmes capacités du nœud (système, écran, caméra, microphone, haut-parleurs, canvas) aux clients MCP locaux (Claude Desktop, Cursor, Claude Code). 
Point de terminaison : @@ -1413,4 +1413,51 @@ Sur votre hôte passerelle (Mac/Linux), exécutez : Vous pouvez configurer la passerelle plus tard dans les paramètres. + + + Lecture de la synthèse vocale + + + La synthèse vocale expose tts.speak. L'utilisation via la passerelle nécessite que tts.speak figure dans gateway.nodes.allowCommands. + + + Fournisseur TTS + + + Voix intégrée à Windows + + + ElevenLabs + + + Clé API ElevenLabs + + + ID de voix ElevenLabs + + + Modèle ElevenLabs + + + eleven_multilingual_v2 + + + Reconnaissance vocale (microphone) + + + Lorsque cette option est activée, ce nœud Windows annonce stt.transcribe pour capturer de courts extraits audio depuis le microphone par défaut et renvoyer le texte transcrit. Désactivé par défaut. Expose le microphone aux appelants via la passerelle (lorsque le mode Nœud est activé, sous réserve de la liste d'autorisation de la passerelle) et aux clients MCP locaux (lorsque le serveur MCP local est activé, sous réserve du jeton porteur). Nécessite l'accès au microphone pour les applications de bureau dans Paramètres → Confidentialité et sécurité → Microphone (les installations MSIX empaquetées nécessitent en outre une autorisation par application). + + + Activer la reconnaissance vocale + + + Étiquette de langue + + + en-US + + + Étiquette BCP-47 (par ex. en-US, en-GB, ja-JP). Le pack de reconnaissance vocale pour cette langue doit être installé sur le système d'exploitation. + + diff --git a/src/OpenClaw.Tray.WinUI/Strings/nl-nl/Resources.resw b/src/OpenClaw.Tray.WinUI/Strings/nl-nl/Resources.resw index 443c34b9..c0504539 100644 --- a/src/OpenClaw.Tray.WinUI/Strings/nl-nl/Resources.resw +++ b/src/OpenClaw.Tray.WinUI/Strings/nl-nl/Resources.resw @@ -960,7 +960,7 @@ Voer op uw gateway-host (Mac/Linux) uit: Lokale MCP-server inschakelen - Stelt dezelfde knooppuntmogelijkheden (systeem, scherm, camera, canvas) beschikbaar voor lokale MCP-clients (Claude Desktop, Cursor, Claude Code). 
+ Stelt dezelfde knooppuntmogelijkheden (systeem, scherm, camera, microfoon, luidsprekers, canvas) beschikbaar voor lokale MCP-clients (Claude Desktop, Cursor, Claude Code). Eindpunt: @@ -1414,4 +1414,51 @@ Voer op uw gateway-host (Mac/Linux) uit: U kunt de gateway later configureren in Instellingen. + + + Tekst-naar-spraak afspelen + + + Tekst-naar-spraak stelt tts.speak beschikbaar. Gebruik via de gateway vereist nog steeds dat tts.speak in gateway.nodes.allowCommands staat. + + + TTS-aanbieder + + + Ingebouwde Windows-spraak + + + ElevenLabs + + + ElevenLabs API-sleutel + + + ElevenLabs stem-ID + + + ElevenLabs-model + + + eleven_multilingual_v2 + + + Spraak-naar-tekst (microfoon) + + + Wanneer ingeschakeld, kondigt deze Windows-node stt.transcribe aan om korte audiofragmenten van de standaardmicrofoon op te nemen en getranscribeerde tekst terug te geven. Standaard uit. Stelt de microfoon beschikbaar aan zowel gateway-aanroepers (wanneer Node Mode aan staat, onder voorbehoud van de gateway-toelatingslijst) als lokale MCP-clients (wanneer de lokale MCP-server aan staat, onder voorbehoud van het bearer-token). Vereist Windows-microfoontoegang voor desktop-apps in Instellingen → Privacy & beveiliging → Microfoon (verpakte MSIX-installaties vereisen daarnaast app-specifieke toestemming). + + + Spraak-naar-tekst inschakelen + + + Taalcode + + + en-US + + + BCP-47-tag (bijv. en-US, en-GB, ja-JP). Het besturingssysteem moet het spraakherkenningspakket voor deze taal hebben geïnstalleerd. 
+ + diff --git a/src/OpenClaw.Tray.WinUI/Strings/zh-cn/Resources.resw b/src/OpenClaw.Tray.WinUI/Strings/zh-cn/Resources.resw index 3bd92a3e..c38c8b37 100644 --- a/src/OpenClaw.Tray.WinUI/Strings/zh-cn/Resources.resw +++ b/src/OpenClaw.Tray.WinUI/Strings/zh-cn/Resources.resw @@ -960,7 +960,7 @@ 启用本地 MCP 服务器 - 向本地 MCP 客户端(Claude Desktop、Cursor、Claude Code)公开相同的节点功能(系统、屏幕、摄像头、画布)。 + 向本地 MCP 客户端(Claude Desktop、Cursor、Claude Code)公开相同的节点功能(系统、屏幕、摄像头、麦克风、扬声器、画布)。 终结点: @@ -1414,4 +1414,51 @@ 您可以稍后在“设置”中配置网关。 + + + 文字转语音播放 + + + 文字转语音功能公开 tts.speak。通过网关调用仍需要在 gateway.nodes.allowCommands 中包含 tts.speak。 + + + TTS 提供程序 + + + Windows 内置语音 + + + ElevenLabs + + + ElevenLabs API 密钥 + + + ElevenLabs 语音 ID + + + ElevenLabs 模型 + + + eleven_multilingual_v2 + + + 语音转文字(麦克风) + + + 启用后,此 Windows 节点会公开 stt.transcribe,从默认麦克风捕获短音频片段并返回转写文本。默认关闭。会向网关调用方(当节点模式开启时,受网关允许列表约束)和本地 MCP 客户端(当本地 MCP 服务器开启时,受持有者令牌约束)公开麦克风。要求在“设置 → 隐私和安全性 → 麦克风”中允许桌面应用访问麦克风(打包的 MSIX 安装还需要按应用授予权限)。 + + + 启用语音转文字 + + + 语言代码 + + + en-US + + + BCP-47 标签(例如 en-US、en-GB、ja-JP)。操作系统必须已安装该语言的语音识别包。 + + diff --git a/src/OpenClaw.Tray.WinUI/Strings/zh-tw/Resources.resw b/src/OpenClaw.Tray.WinUI/Strings/zh-tw/Resources.resw index dd76c719..5542b4a5 100644 --- a/src/OpenClaw.Tray.WinUI/Strings/zh-tw/Resources.resw +++ b/src/OpenClaw.Tray.WinUI/Strings/zh-tw/Resources.resw @@ -960,7 +960,7 @@ 啟用本機 MCP 伺服器 - 向本機 MCP 用戶端(Claude Desktop、Cursor、Claude Code)公開相同的節點功能(系統、螢幕、攝影機、畫布)。 + 向本機 MCP 用戶端(Claude Desktop、Cursor、Claude Code)公開相同的節點功能(系統、螢幕、攝影機、麥克風、喇叭、畫布)。 端點: @@ -1414,4 +1414,51 @@ 您可以稍後在「設定」中設定閘道。 + + + 文字轉語音播放 + + + 文字轉語音功能公開 tts.speak。透過閘道使用仍需在 gateway.nodes.allowCommands 中包含 tts.speak。 + + + TTS 提供者 + + + Windows 內建語音 + + + ElevenLabs + + + ElevenLabs API 金鑰 + + + ElevenLabs 語音 ID + + + ElevenLabs 模型 + + + eleven_multilingual_v2 + + + 語音轉文字(麥克風) + + + 啟用後,此 Windows 節點會公開 stt.transcribe,以從預設麥克風擷取短音訊片段並回傳轉錄的文字。預設為關閉。會向閘道呼叫者(當節點模式開啟時,受閘道允許清單約束)和本機 MCP 用戶端(當本機 MCP 伺服器開啟時,受持有者權杖約束)公開麥克風。需在「設定 → 隱私權與安全性 → 麥克風」中允許桌面應用程式存取麥克風(封裝的 
MSIX 安裝還需要個別應用程式權限)。 + + + 啟用語音轉文字 + + + 語言標籤 + + + en-US + + + BCP-47 標籤(例如 en-US、en-GB、ja-JP)。作業系統必須已安裝該語言的語音辨識套件。 + + diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml index 52149dad..0fc2ddbd 100644 --- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml +++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml @@ -200,28 +200,33 @@ - - + - - + + @@ -235,21 +240,23 @@ Padding="12" Margin="0,8,0,0"> - - + - + - - + + - @@ -260,7 +267,7 @@ Foreground="{ThemeResource AccentTextFillColorPrimaryBrush}" FontWeight="Bold" Margin="0,16,0,0"/> throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt-priv-ex", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + } + [Fact] public async Task Transcribe_ReturnsError_WhenHandlerNotWired() { @@ -2985,7 +3025,10 @@ public async Task Transcribe_ReportsHandlerException() }); Assert.False(res.Ok); - Assert.Contains("Microphone unavailable", res.Error); + // Privacy: response surfaces a fixed sanitized error; raw exception + // text stays in the local log only. See + // Transcribe_HandlerException_DoesNotLeakExceptionMessageIntoError. 
+ Assert.Equal("Transcribe failed", res.Error); } [Fact] diff --git a/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs b/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs index b14b9c2a..128571bf 100644 --- a/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs +++ b/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs @@ -2,6 +2,13 @@ namespace OpenClaw.Tray.Tests; +[CollectionDefinition(ActivityStreamServiceCollection.Name, DisableParallelization = true)] +public sealed class ActivityStreamServiceCollection +{ + public const string Name = "ActivityStreamService"; +} + +[Collection(ActivityStreamServiceCollection.Name)] public class ActivityStreamServiceTests : IDisposable { public ActivityStreamServiceTests() diff --git a/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs b/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs new file mode 100644 index 00000000..69671057 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs @@ -0,0 +1,132 @@ +using OpenClawTray.Services; + +namespace OpenClaw.Tray.Tests; + +/// +/// Pins the optional-capability gating that drives both the gateway client +/// path and the MCP-only path inside NodeService.RegisterCapabilities. +/// +/// Privacy-sensitive defaults must be **off** even when settings are missing. +/// A regression that flips Stt/Tts to default-on would silently advertise +/// stt.transcribe / tts.speak the moment the tray launches with a fresh +/// settings file, with no user opt-in. 
+/// +public sealed class NodeCapabilityGatingTests : IDisposable +{ + private readonly List _tempDirs = new(); + + public void Dispose() + { + foreach (var dir in _tempDirs) + { + if (Directory.Exists(dir)) + Directory.Delete(dir, recursive: true); + } + } + + private SettingsManager NewSettings() + { + var dir = Path.Combine(Path.GetTempPath(), "openclaw-tray-tests-" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(dir); + _tempDirs.Add(dir); + return new SettingsManager(dir); + } + + [Fact] + public void NullSettings_DefaultOnCapabilities_AreEnabled() + { + // Defensive default: when settings are not yet loaded, we still + // advertise the non-privacy-sensitive capabilities so the node is + // usable immediately. + Assert.True(NodeCapabilityGating.ShouldRegisterCanvas(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterScreen(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterCamera(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterLocation(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterBrowserProxy(null)); + } + + [Fact] + public void NullSettings_PrivacySensitiveCapabilities_AreDisabled() + { + // Privacy invariant: TTS and STT must require an explicit user + // opt-in. A null/missing settings object must not enable mic capture + // or speaker output. 
+ Assert.False(NodeCapabilityGating.ShouldRegisterTts(null)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(null)); + } + + [Fact] + public void DefaultSettings_PrivacySensitiveCapabilities_AreDisabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void DefaultSettings_OtherCapabilities_AreEnabled() + { + var s = NewSettings(); + Assert.True(NodeCapabilityGating.ShouldRegisterCanvas(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterScreen(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterCamera(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterLocation(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterBrowserProxy(s)); + } + + [Fact] + public void Tts_OnlyAdvertisedWhenExplicitlyEnabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + s.NodeTtsEnabled = true; + Assert.True(NodeCapabilityGating.ShouldRegisterTts(s)); + s.NodeTtsEnabled = false; + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + } + + [Fact] + public void Stt_OnlyAdvertisedWhenExplicitlyEnabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + s.NodeSttEnabled = true; + Assert.True(NodeCapabilityGating.ShouldRegisterStt(s)); + s.NodeSttEnabled = false; + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void TtsAndStt_Independent() + { + // A user who enables only TTS (output) must not silently enable STT + // (input), and vice versa. Each capability is its own consent surface. 
+ var s = NewSettings(); + s.NodeTtsEnabled = true; + s.NodeSttEnabled = false; + Assert.True(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + + s.NodeTtsEnabled = false; + s.NodeSttEnabled = true; + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void DefaultOnCapabilities_OnlyDisabledWhenExplicitlySetToFalse() + { + var s = NewSettings(); + s.NodeCanvasEnabled = false; + s.NodeScreenEnabled = false; + s.NodeCameraEnabled = false; + s.NodeLocationEnabled = false; + s.NodeBrowserProxyEnabled = false; + + Assert.False(NodeCapabilityGating.ShouldRegisterCanvas(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterScreen(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterCamera(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterLocation(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterBrowserProxy(s)); + } +} diff --git a/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs b/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs new file mode 100644 index 00000000..bb314bcf --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs @@ -0,0 +1,120 @@ +using OpenClawTray.Services; + +namespace OpenClaw.Tray.Tests; + +/// +/// Privacy regression tests for the activity-stream details formatter that +/// powers both the recent-activity menu and the support bundle. +/// +/// The end-to-end persistence path is: +/// NodeService.OnNodeInvokeCompleted (capability handler exception) +/// → App.OnNodeInvokeCompleted +/// → NodeInvokeActivityFormatter.BuildDetails +/// → ActivityStreamService.Add +/// → ActivityStreamService.BuildSupportBundle (when user shares logs) +/// +/// For privacy-sensitive commands (mic / camera / screen), no caller-supplied +/// arg or runtime detail may reach support bundles. This test pins that. 
+/// +[Collection(ActivityStreamServiceCollection.Name)] +public sealed class NodeInvokeActivityFormatterTests : IDisposable +{ + public NodeInvokeActivityFormatterTests() => ActivityStreamService.Clear(); + public void Dispose() => ActivityStreamService.Clear(); + + [Theory] + [InlineData("stt.transcribe")] + [InlineData("camera.snap")] + [InlineData("camera.clip")] + [InlineData("screen.snapshot")] + [InlineData("screen.record")] + public void PrivacySensitive_FailedInvoke_OmitsErrorTextFromDetails(string command) + { + const string secret = "secret-language-or-device-detail"; + var details = NodeInvokeActivityFormatter.BuildDetails(command, ok: false, durationMs: 4321, error: secret); + + Assert.Equal("privacy-sensitive · 4321 ms · error", details); + Assert.DoesNotContain(secret, details); + } + + [Fact] + public void PrivacySensitive_FailedInvoke_SecretDoesNotReachSupportBundle() + { + const string secret = "secret-language-or-device-detail"; + var details = NodeInvokeActivityFormatter.BuildDetails("stt.transcribe", ok: false, durationMs: 1234, error: secret); + + ActivityStreamService.Add( + category: "node.invoke", + title: "node.invoke failed: stt.transcribe", + details: details, + nodeId: "test-node"); + + var bundle = ActivityStreamService.BuildSupportBundle(); + Assert.DoesNotContain(secret, bundle); + Assert.Contains("privacy-sensitive · 1234 ms · error", bundle); + } + + [Fact] + public void PrivacySensitive_SuccessfulInvoke_OmitsAllDetail() + { + var details = NodeInvokeActivityFormatter.BuildDetails("stt.transcribe", ok: true, durationMs: 800, error: null); + Assert.Equal("privacy-sensitive · 800 ms", details); + } + + [Fact] + public void NonPrivacySensitive_FailedInvoke_KeepsErrorForDiagnostics() + { + // Non-privacy-sensitive commands (metadata / exec) keep the error text + // because they're useful for diagnostics and don't carry mic/camera args. 
+ var details = NodeInvokeActivityFormatter.BuildDetails( + "device.status", + ok: false, + durationMs: 50, + error: "gateway unreachable"); + + Assert.Equal("metadata · 50 ms · gateway unreachable", details); + } + + [Fact] + public void NonPrivacySensitive_FailedInvoke_NullError_FallsBackToUnknown() + { + var details = NodeInvokeActivityFormatter.BuildDetails("device.status", ok: false, durationMs: 0, error: null); + Assert.Equal("metadata · 0 ms · unknown error", details); + } + + [Fact] + public void Exec_FailedInvoke_KeepsErrorForDiagnostics() + { + var details = NodeInvokeActivityFormatter.BuildDetails( + "system.run", + ok: false, + durationMs: 100, + error: "exit code 1"); + + Assert.Equal("exec · 100 ms · exit code 1", details); + } + + [Fact] + public void NegativeDuration_ClampsToZero() + { + var details = NodeInvokeActivityFormatter.BuildDetails("device.status", ok: true, durationMs: -7, error: null); + Assert.Equal("metadata · 0 ms", details); + } + + [Theory] + [InlineData("stt.transcribe", "privacy-sensitive")] + [InlineData("STT.Transcribe", "privacy-sensitive")] + [InlineData("camera.snap", "privacy-sensitive")] + [InlineData("camera.clip", "privacy-sensitive")] + [InlineData("screen.snapshot", "privacy-sensitive")] + [InlineData("screen.record", "privacy-sensitive")] + [InlineData("system.run", "exec")] + [InlineData("system.run.shell", "exec")] + [InlineData("device.status", "metadata")] + [InlineData("tts.speak", "metadata")] // TTS speaks audio out, not in — not classified privacy-sensitive + [InlineData("", "metadata")] + public void GetPrivacyClass_KnownCommands(string command, string expected) + { + Assert.Equal(expected, NodeInvokeActivityFormatter.GetPrivacyClass(command)); + } +} diff --git a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj index 6f653ad2..117244ef 100644 --- a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj +++ 
b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj @@ -25,6 +25,8 @@ + + diff --git a/tests/OpenClaw.Tray.Tests/SettingsWindowLocalizationCoverageTests.cs b/tests/OpenClaw.Tray.Tests/SettingsWindowLocalizationCoverageTests.cs new file mode 100644 index 00000000..7ab07709 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/SettingsWindowLocalizationCoverageTests.cs @@ -0,0 +1,113 @@ +using System.Xml.Linq; + +namespace OpenClaw.Tray.Tests; + +/// +/// Pins that the TTS / STT controls in SettingsWindow.xaml are localized +/// (have an x:Uid) and that en-us\Resources.resw provides matching keys. +/// +/// LocalizationValidationTests only catches drift between locales — it does +/// not catch the case where a developer adds a new control with hardcoded +/// English text and never registers it in any .resw file. This test closes +/// that hole for the privacy-sensitive voice surface. +/// +public sealed class SettingsWindowLocalizationCoverageTests +{ + private static readonly XNamespace XamlNs = "http://schemas.microsoft.com/winfx/2006/xaml/presentation"; + private static readonly XNamespace XNs = "http://schemas.microsoft.com/winfx/2006/xaml"; + + private static string GetRepositoryRoot() + { + var envRepoRoot = Environment.GetEnvironmentVariable("OPENCLAW_REPO_ROOT"); + if (!string.IsNullOrWhiteSpace(envRepoRoot) && Directory.Exists(envRepoRoot)) + return envRepoRoot; + + var directory = new DirectoryInfo(AppContext.BaseDirectory); + while (directory != null) + { + if ((Directory.Exists(Path.Combine(directory.FullName, ".git")) || + File.Exists(Path.Combine(directory.FullName, ".git"))) && + File.Exists(Path.Combine(directory.FullName, "README.md"))) + return directory.FullName; + directory = directory.Parent; + } + + throw new InvalidOperationException( + "Could not find repository root. 
Set OPENCLAW_REPO_ROOT to the repo path."); + } + + private static string GetSettingsXamlPath() => + Path.Combine(GetRepositoryRoot(), "src", "OpenClaw.Tray.WinUI", "Windows", "SettingsWindow.xaml"); + + private static string GetEnUsReswPath() => + Path.Combine(GetRepositoryRoot(), "src", "OpenClaw.Tray.WinUI", "Strings", "en-us", "Resources.resw"); + + private static HashSet LoadReswKeys() + { + var doc = XDocument.Load(GetEnUsReswPath()); + return doc.Descendants("data") + .Select(e => e.Attribute("name")!.Value) + .ToHashSet(StringComparer.Ordinal); + } + + /// + /// Each entry: (x:Uid, list of resw key suffixes that MUST exist in en-us). + /// Suffixes are appended to the Uid to form the resw key (".Header", + /// ".Text", ".Content", ".PlaceholderText"). + /// + /// This is the contract for the new TTS / STT settings surface. Any + /// developer adding/renaming a control here must update both the XAML + /// and the .resw entries; this test is the trip-wire. + /// + public static IEnumerable TtsAndSttUids => new[] + { + // Phase 1 / TTS + new object[] { "SettingsNodeTtsToggle", new[] { ".Header" } }, + new object[] { "SettingsNodeTtsDescription", new[] { ".Text" } }, + new object[] { "SettingsTtsProviderComboBox", new[] { ".Header" } }, + new object[] { "SettingsTtsProviderWindowsItem", new[] { ".Content" } }, + new object[] { "SettingsTtsProviderElevenLabsItem",new[] { ".Content" } }, + new object[] { "SettingsTtsElevenLabsApiKey", new[] { ".Header" } }, + new object[] { "SettingsTtsElevenLabsVoiceId", new[] { ".Header" } }, + new object[] { "SettingsTtsElevenLabsModel", new[] { ".Header", ".PlaceholderText" } }, + // Phase 2 / STT + new object[] { "SettingsNodeSttHeader", new[] { ".Text" } }, + new object[] { "SettingsNodeSttDescription", new[] { ".Text" } }, + new object[] { "SettingsNodeSttToggle", new[] { ".Header" } }, + new object[] { "SettingsSttLanguageLabel", new[] { ".Text" } }, + new object[] { "SettingsSttLanguageTextBox", new[] { 
".PlaceholderText" } }, + new object[] { "SettingsSttLanguageHelp", new[] { ".Text" } }, + }; + + private static HashSet LoadSettingsXamlUids() + { + var doc = XDocument.Load(GetSettingsXamlPath()); + return doc.Descendants() + .Select(e => e.Attribute(XNs + "Uid")?.Value) + .Where(v => !string.IsNullOrEmpty(v)) + .Cast() + .ToHashSet(StringComparer.Ordinal); + } + + [Theory] + [MemberData(nameof(TtsAndSttUids))] + public void TtsAndSttControl_HasXUid_InSettingsWindowXaml(string uid, string[] _) + { + var uids = LoadSettingsXamlUids(); + Assert.Contains(uid, uids); + } + + [Theory] + [MemberData(nameof(TtsAndSttUids))] + public void TtsAndSttControl_AllExpectedReswKeys_ExistInEnUs(string uid, string[] suffixes) + { + var keys = LoadReswKeys(); + var missing = suffixes + .Select(suffix => uid + suffix) + .Where(key => !keys.Contains(key)) + .ToList(); + + Assert.True(missing.Count == 0, + $"Missing en-us resw keys for x:Uid '{uid}': {string.Join(", ", missing)}"); + } +} From 0f50220e91ca5c3702a1e5bfe05d0d6588118a91 Mon Sep 17 00:00:00 2001 From: Ranjesh Jaganathan Date: Tue, 5 May 2026 05:19:54 -0700 Subject: [PATCH 03/34] Remove fake/sample data from 6 UI pages Replace constructor-injected sample data with empty/loading states: - UsagePage: remove fabricated provider costs and daily data - SessionsPage: remove 3 fake AI conversation sessions - NodesPage: remove fake Desktop-PC/MacBook-Pro nodes - ChannelsPage: remove fake Telegram/WhatsApp channels - SkillsPage: remove fake skills and stale 'API not yet wired' warning - CronPage: remove fake cron jobs, stale warning, fix hardcoded defaults All pages now show proper empty states until real gateway data arrives. The Skills and Cron APIs were already fully wired; the warnings were simply outdated and misleading. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Pages/ChannelsPage.xaml.cs | 8 --- src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml | 15 ++---- .../Pages/CronPage.xaml.cs | 53 ------------------- .../Pages/NodesPage.xaml.cs | 20 ------- .../Pages/SessionsPage.xaml.cs | 10 ---- src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml | 7 --- .../Pages/SkillsPage.xaml.cs | 49 ----------------- src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml | 8 +-- .../Pages/UsagePage.xaml.cs | 15 ------ 9 files changed, 8 insertions(+), 177 deletions(-) diff --git a/src/OpenClaw.Tray.WinUI/Pages/ChannelsPage.xaml.cs b/src/OpenClaw.Tray.WinUI/Pages/ChannelsPage.xaml.cs index 61f00ae8..1a151952 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/ChannelsPage.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Pages/ChannelsPage.xaml.cs @@ -17,14 +17,6 @@ public sealed partial class ChannelsPage : Page public ChannelsPage() { InitializeComponent(); - - // Sample data for design-time preview - var samples = new List - { - new() { Name = "Telegram", Status = "connected", StatusColor = "Green", IsRunning = true, ProbeInfo = "Bot: @myclaw_bot · 45ms" }, - new() { Name = "WhatsApp", Status = "disconnected", StatusColor = "Red", IsRunning = false, ProbeInfo = null }, - }; - RenderChannels(samples); } public void Initialize(HubWindow hub) diff --git a/src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml b/src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml index ca037a11..b6a5a4f7 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml +++ b/src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml @@ -9,13 +9,6 @@ - - - - + - diff --git a/src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml.cs b/src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml.cs index 29bb50b2..c63b65d3 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml.cs @@ -12,7 +12,6 @@ namespace OpenClawTray.Pages; public sealed partial class CronPage : Page { private HubWindow? 
_hub; - private bool _hasLiveData; public CronPage() { @@ -24,57 +23,9 @@ public void Initialize(HubWindow hub) _hub = hub; if (hub.GatewayClient != null) { - NotWiredInfoBar.IsOpen = false; _ = hub.GatewayClient.RequestCronListAsync(); _ = hub.GatewayClient.RequestCronStatusAsync(); } - LoadSampleJobs(); - } - - private void LoadSampleJobs() - { - if (_hasLiveData) return; - - var jobs = new List - { - new() - { - Id = "sample-1", - Name = "Daily email summary", - Schedule = "0 9 * * *", - IsEnabled = true, - LastRunTime = DateTime.Now.AddHours(-3).ToString("yyyy-MM-dd HH:mm"), - LastResult = "success", - ResultBadgeBackground = new SolidColorBrush(Colors.Green), - NextRunTime = DateTime.Now.AddHours(21).ToString("yyyy-MM-dd HH:mm"), - }, - new() - { - Id = "sample-2", - Name = "Backup config", - Schedule = "0 0 * * 0", - IsEnabled = true, - LastRunTime = DateTime.Now.AddDays(-3).ToString("yyyy-MM-dd HH:mm"), - LastResult = "success", - ResultBadgeBackground = new SolidColorBrush(Colors.Green), - NextRunTime = DateTime.Now.AddDays(4).ToString("yyyy-MM-dd HH:mm"), - }, - new() - { - Id = "sample-3", - Name = "Health check", - Schedule = "*/15 * * * *", - IsEnabled = true, - LastRunTime = DateTime.Now.AddMinutes(-7).ToString("yyyy-MM-dd HH:mm"), - LastResult = "fail", - ResultBadgeBackground = new SolidColorBrush(Colors.Red), - NextRunTime = DateTime.Now.AddMinutes(8).ToString("yyyy-MM-dd HH:mm"), - }, - }; - - JobsList.ItemsSource = jobs; - JobsList.Visibility = Visibility.Visible; - EmptyState.Visibility = Visibility.Collapsed; } private void OnRunNowClick(object sender, RoutedEventArgs e) @@ -167,9 +118,6 @@ private void ParseCronList(JsonElement payload) DispatcherQueue?.TryEnqueue(() => { - _hasLiveData = true; - NotWiredInfoBar.IsOpen = false; - if (jobs.Count > 0) { JobsList.ItemsSource = jobs; @@ -205,7 +153,6 @@ private void ParseCronStatus(JsonElement payload) DispatcherQueue?.TryEnqueue(() => { - NotWiredInfoBar.IsOpen = false; SchedulerToggle.IsOn = 
enabled; SchedulerStatusText.Text = enabled ? "Enabled" : "Disabled"; SchedulerStatusIndicator.Fill = new SolidColorBrush(enabled ? Colors.LimeGreen : Colors.Gray); diff --git a/src/OpenClaw.Tray.WinUI/Pages/NodesPage.xaml.cs b/src/OpenClaw.Tray.WinUI/Pages/NodesPage.xaml.cs index e6a54102..032cb7a9 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/NodesPage.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Pages/NodesPage.xaml.cs @@ -19,26 +19,6 @@ public sealed partial class NodesPage : Page public NodesPage() { InitializeComponent(); - - // Sample data - var samples = new List - { - new() - { - Name = "Desktop-PC", DeviceId = "a1b2c3d4e5f6g7h8i9j0", - Platform = "windows", IsOnline = true, - Capabilities = new[] { "canvas", "camera", "screen", "system", "clipboard", "browser" }, - Commands = new[] { "screenshot", "open-url", "run-command", "read-clipboard", "write-clipboard" }, - }, - new() - { - Name = "MacBook-Pro", DeviceId = "z9y8x7w6v5u4t3s2r1q0", - Platform = "macos", IsOnline = false, - Capabilities = new[] { "canvas", "screen", "system" }, - Commands = new[] { "screenshot", "open-url" }, - }, - }; - RenderNodes(samples); } public void Initialize(HubWindow hub) diff --git a/src/OpenClaw.Tray.WinUI/Pages/SessionsPage.xaml.cs b/src/OpenClaw.Tray.WinUI/Pages/SessionsPage.xaml.cs index 8875285a..0fee428d 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/SessionsPage.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Pages/SessionsPage.xaml.cs @@ -15,16 +15,6 @@ public sealed partial class SessionsPage : Page public SessionsPage() { InitializeComponent(); - - // Sample data for design-time preview - var samples = new List - { - new() { Key = "agent:main", Preview = "Help me refactor the authentication module to use JWT tokens...", TimeAgo = "2m ago", ThinkingLevel = "medium", VerboseLevel = null, IsActive = true }, - new() { Key = "agent:cron:daily-summary", Preview = "Generated daily summary for 3 channels with 47 messages.", TimeAgo = "1h ago", ThinkingLevel = null, VerboseLevel = "detailed", 
IsActive = false }, - new() { Key = "telegram:user:12345", Preview = "Remind me to check the deployment status at 5pm today.", TimeAgo = "15m ago", ThinkingLevel = null, VerboseLevel = null, IsActive = true }, - }; - SessionListView.ItemsSource = samples; - EmptyState.Visibility = Visibility.Collapsed; } public void Initialize(HubWindow hub) diff --git a/src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml b/src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml index 190c18f9..cec5ce2e 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml +++ b/src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml @@ -15,13 +15,6 @@ - - - diff --git a/src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml.cs b/src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml.cs index 322f9e37..1c5f791b 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml.cs @@ -11,7 +11,6 @@ namespace OpenClawTray.Pages; public sealed partial class SkillsPage : Page { private HubWindow? _hub; - private bool _hasLiveData; public SkillsPage() { @@ -24,10 +23,8 @@ public void Initialize(HubWindow hub) PopulateAgentFilter(hub); if (hub.GatewayClient != null) { - NotWiredInfoBar.IsOpen = false; _ = hub.GatewayClient.RequestSkillsStatusAsync(GetSelectedAgentId()); } - LoadSampleSkills(); } private void PopulateAgentFilter(HubWindow hub) @@ -58,49 +55,6 @@ private void OnAgentFilterChanged(object sender, SelectionChangedEventArgs e) _ = client.RequestSkillsStatusAsync(GetSelectedAgentId()); } - private void LoadSampleSkills() - { - if (_hasLiveData) return; - - var skills = new List - { - new() - { - Id = "github", - Name = "GitHub Integration", - Version = "v2.1", - Description = "Connect OpenClaw to GitHub for issue tracking, PR reviews, and repository management.", - StatusText = "Active", - StatusBackground = new SolidColorBrush(Colors.Green), - ActionLabel = "Update", - }, - new() - { - Id = "email", - Name = "Email Digest", - Version = "v1.3", - Description = "Automatically summarize 
and send email digests of daily activity and session outcomes.", - StatusText = "Active", - StatusBackground = new SolidColorBrush(Colors.Green), - ActionLabel = "Update", - }, - new() - { - Id = "calendar", - Name = "Calendar Sync", - Version = "v0.9", - Description = "Sync scheduled tasks and cron jobs with your calendar provider for visibility.", - StatusText = "Inactive", - StatusBackground = new SolidColorBrush(Colors.Gray), - ActionLabel = "Enable", - }, - }; - - SkillsList.ItemsSource = skills; - SkillsList.Visibility = Visibility.Visible; - EmptyState.Visibility = Visibility.Collapsed; - } - private void OnSkillActionClick(object sender, RoutedEventArgs e) { var skillId = (sender as Button)?.Tag as string; @@ -165,9 +119,6 @@ public void UpdateFromGateway(JsonElement data) DispatcherQueue?.TryEnqueue(() => { - _hasLiveData = true; - NotWiredInfoBar.IsOpen = false; - if (skills.Count > 0) { SkillsList.ItemsSource = skills; diff --git a/src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml b/src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml index c7a789e5..aae2a52a 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml +++ b/src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml @@ -21,7 +21,7 @@ - @@ -34,7 +34,7 @@ - + @@ -44,7 +44,7 @@ - + @@ -54,7 +54,7 @@ - + diff --git a/src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml.cs b/src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml.cs index 7af75d9a..dd3375c5 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml.cs @@ -16,21 +16,6 @@ public sealed partial class UsagePage : Page public UsagePage() { InitializeComponent(); - - // Sample data - ProviderListView.ItemsSource = new List - { - new() { Name = "OpenAI", Requests = "87 req", Tokens = "182.3K tok", Cost = "$1.24" }, - new() { Name = "Anthropic", Requests = "41 req", Tokens = "78.1K tok", Cost = "$0.89" }, - new() { Name = "Google", Requests = "14 req", Tokens = "24.1K tok", Cost = "$0.28" }, - }; - - var today = DateTime.Today; - 
DailyListView.ItemsSource = Enumerable.Range(0, 7).Select(i => new DailyRow - { - Date = today.AddDays(-i).ToString("ddd, MMM d"), - Cost = $"${(0.15 + i * 0.12 + (i % 3) * 0.18):F2}", - }).ToList(); } public void Initialize(HubWindow hub) From e77bd55852a76f0315e00bede322ad37441f1723 Mon Sep 17 00:00:00 2001 From: Ranjesh Jaganathan Date: Tue, 5 May 2026 09:49:10 -0700 Subject: [PATCH 04/34] feat: add voice/audio support with local Whisper STT Add full voice interaction capabilities to the Windows node: Core audio pipeline: - NAudio WASAPI microphone capture with MTA thread initialization - Energy-based voice activity detection with hysteresis - Whisper.net speech-to-text with multi-threaded inference - Pre-buffer to capture speech onset before VAD triggers - Auto-download of Whisper models from HuggingFace Voice overlay window: - Modern WinUI 3 floating window with Mica backdrop and custom title bar - Chat-style transcript bubbles with segment consolidation - Real-time audio level visualization - Start/Stop, Mute, and Settings controls STT node capability: - stt.listen and stt.status MCP commands for agent-initiated listening - Follows existing capability pattern (like TTS) Voice settings page: - Model size selection (tiny/base/small) with download management - Language selection (auto-detect + 9 languages) - Silence timeout slider - TTS voice picker with Windows neural voice enumeration - ElevenLabs provider configuration - Voice preview button Integration: - Tray menu Voice item - Ctrl+Alt+Shift+V global hotkey for push-to-talk - Deep links: openclaw://voice, openclaw://voice-stop - Gateway chat responses shown in voice overlay - TTS response playback with mic muting to prevent echo - Capabilities page STT toggle - Hub navigation Voice & Audio page Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/OpenClaw.Shared/Audio/AudioModels.cs | 48 ++ .../Audio/SpeechToTextService.cs | 150 ++++++ .../Audio/VoiceActivityDetector.cs | 108 ++++ 
.../Audio/WhisperModelManager.cs | 138 +++++ .../Capabilities/SttCapability.cs | 147 ++++++ src/OpenClaw.Shared/OpenClaw.Shared.csproj | 7 + src/OpenClaw.Shared/SettingsData.cs | 13 + src/OpenClaw.Tray.WinUI/App.xaml.cs | 119 ++++- .../OpenClaw.Tray.WinUI.csproj | 1 + .../Pages/CapabilitiesPage.xaml.cs | 1 + .../Pages/VoiceSettingsPage.xaml | 154 ++++++ .../Pages/VoiceSettingsPage.xaml.cs | 360 +++++++++++++ .../Services/AudioPipeline.cs | 471 ++++++++++++++++++ .../Services/DeepLinkHandler.cs | 11 + .../Services/GlobalHotkeyService.cs | 23 +- .../Services/NodeService.cs | 31 ++ .../Services/SettingsManager.cs | 19 + .../Services/VoiceService.cs | 456 +++++++++++++++++ .../Windows/HubWindow.xaml | 3 + .../Windows/HubWindow.xaml.cs | 3 + .../Windows/VoiceOverlayWindow.xaml | 125 +++++ .../Windows/VoiceOverlayWindow.xaml.cs | 314 ++++++++++++ 22 files changed, 2699 insertions(+), 3 deletions(-) create mode 100644 src/OpenClaw.Shared/Audio/AudioModels.cs create mode 100644 src/OpenClaw.Shared/Audio/SpeechToTextService.cs create mode 100644 src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs create mode 100644 src/OpenClaw.Shared/Audio/WhisperModelManager.cs create mode 100644 src/OpenClaw.Shared/Capabilities/SttCapability.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Services/AudioPipeline.cs create mode 100644 src/OpenClaw.Tray.WinUI/Services/VoiceService.cs create mode 100644 src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs diff --git a/src/OpenClaw.Shared/Audio/AudioModels.cs b/src/OpenClaw.Shared/Audio/AudioModels.cs new file mode 100644 index 00000000..c5f97b34 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/AudioModels.cs @@ -0,0 +1,48 @@ +using System; + +namespace OpenClaw.Shared.Audio; + +/// Result of a speech-to-text transcription 
segment. +public sealed class TranscriptionResult +{ + public string Text { get; init; } = ""; + public TimeSpan Start { get; init; } + public TimeSpan End { get; init; } + public string Language { get; init; } = "en"; +} + +/// Voice-activity detection event. +public sealed class VadEvent +{ + public bool IsSpeaking { get; init; } + public float Probability { get; init; } +} + +/// Configuration for the audio pipeline. +public sealed class AudioPipelineOptions +{ + /// Path to the Whisper GGML model file. + public string ModelPath { get; init; } = ""; + + /// Language code for STT (e.g. "en", "auto"). + public string Language { get; init; } = "auto"; + + /// Seconds of silence before a speech segment is finalized. + public float SilenceTimeoutSeconds { get; init; } = 1.5f; + + /// Optional audio device ID. Null = system default microphone. + public string? DeviceId { get; init; } + + /// VAD probability threshold (0.0–1.0). Audio above this is considered speech. + public float VadThreshold { get; init; } = 0.3f; +} + +/// Pipeline state. +public enum AudioPipelineState +{ + Stopped, + Starting, + Listening, + Processing, + Error +} diff --git a/src/OpenClaw.Shared/Audio/SpeechToTextService.cs b/src/OpenClaw.Shared/Audio/SpeechToTextService.cs new file mode 100644 index 00000000..2ef8ecf7 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/SpeechToTextService.cs @@ -0,0 +1,150 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Whisper.net; +using Whisper.net.Ggml; + +namespace OpenClaw.Shared.Audio; + +/// +/// Wraps Whisper.net for speech-to-text transcription. +/// Lazily loads the model on first use and caches the factory. +/// Thread-safe: concurrent calls are serialized by a semaphore. +/// +public sealed class SpeechToTextService : IDisposable +{ + private readonly IOpenClawLogger _logger; + private readonly SemaphoreSlim _gate = new(1, 1); + private WhisperFactory? _factory; + private string? 
_loadedModelPath; + + public bool IsModelLoaded => _factory != null; + public string? LoadedModelPath => _loadedModelPath; + + public SpeechToTextService(IOpenClawLogger logger) + { + _logger = logger; + } + + /// Load (or reload) the Whisper model from disk. + public void LoadModel(string modelPath) + { + if (!System.IO.File.Exists(modelPath)) + throw new System.IO.FileNotFoundException($"Whisper model not found: {modelPath}"); + + _factory?.Dispose(); + _factory = WhisperFactory.FromPath(modelPath); + _loadedModelPath = modelPath; + _logger.Info($"Whisper model loaded: {modelPath}"); + } + + /// Unload the current model and free memory. + public void UnloadModel() + { + _factory?.Dispose(); + _factory = null; + _loadedModelPath = null; + _logger.Info("Whisper model unloaded"); + } + + /// + /// Transcribe raw 16 kHz mono PCM float samples. + /// Returns all detected segments. + /// + public async Task> TranscribeAsync( + float[] samples, + string language = "auto", + CancellationToken cancellationToken = default) + { + if (_factory == null) + throw new InvalidOperationException("No Whisper model is loaded. Call LoadModel first."); + + await _gate.WaitAsync(cancellationToken); + try + { + var builder = _factory.CreateBuilder() + .WithLanguage(language == "auto" ? "auto" : language) + .WithThreads(Math.Max(1, Environment.ProcessorCount / 2)); + + using var processor = builder.Build(); + + using var wavStream = PcmToWavStream(samples, 16000); + + var results = new List(); + await foreach (var segment in processor.ProcessAsync(wavStream, cancellationToken)) + { + var text = segment.Text?.Trim(); + if (!string.IsNullOrEmpty(text)) + { + results.Add(new TranscriptionResult + { + Text = text, + Start = segment.Start, + End = segment.End, + Language = language + }); + } + } + + return results; + } + finally + { + _gate.Release(); + } + } + + /// + /// Convert raw 16-bit PCM float samples to a WAV MemoryStream. + /// Whisper.net processes WAV streams natively. 
+ /// + private static System.IO.MemoryStream PcmToWavStream(float[] samples, int sampleRate) + { + var ms = new System.IO.MemoryStream(); + using var writer = new System.IO.BinaryWriter(ms, System.Text.Encoding.UTF8, leaveOpen: true); + + int bitsPerSample = 16; + short channels = 1; + int byteRate = sampleRate * channels * bitsPerSample / 8; + short blockAlign = (short)(channels * bitsPerSample / 8); + int dataSize = samples.Length * blockAlign; + + // RIFF header + writer.Write("RIFF"u8); + writer.Write(36 + dataSize); + writer.Write("WAVE"u8); + + // fmt subchunk + writer.Write("fmt "u8); + writer.Write(16); // subchunk size + writer.Write((short)1); // PCM format + writer.Write(channels); + writer.Write(sampleRate); + writer.Write(byteRate); + writer.Write(blockAlign); + writer.Write((short)bitsPerSample); + + // data subchunk + writer.Write("data"u8); + writer.Write(dataSize); + + // Convert float [-1.0, 1.0] to int16 + foreach (var sample in samples) + { + var clamped = Math.Clamp(sample, -1.0f, 1.0f); + var int16 = (short)(clamped * 32767); + writer.Write(int16); + } + + writer.Flush(); + ms.Position = 0; + return ms; + } + + public void Dispose() + { + _factory?.Dispose(); + _gate.Dispose(); + } +} diff --git a/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs b/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs new file mode 100644 index 00000000..3112d3a4 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs @@ -0,0 +1,108 @@ +using System; +using Microsoft.ML.OnnxRuntime; +using Microsoft.ML.OnnxRuntime.Tensors; + +namespace OpenClaw.Shared.Audio; + +/// +/// Voice Activity Detection using Silero VAD ONNX model. +/// Processes 16 kHz mono audio in 512-sample chunks (~32 ms each) +/// and returns a speech probability per chunk. +/// +public sealed class VoiceActivityDetector : IDisposable +{ + private InferenceSession? 
_session; + private float[] _state; // internal RNN state: shape [2, 1, 128] + private readonly int _stateSize; + private readonly IOpenClawLogger _logger; + + /// Expected sample rate for input audio. + public const int SampleRate = 16000; + + /// Number of samples per VAD chunk (512 @ 16 kHz = 32 ms). + public const int ChunkSamples = 512; + + public bool IsLoaded => _session != null; + + public VoiceActivityDetector(IOpenClawLogger logger) + { + _logger = logger; + _stateSize = 2 * 1 * 128; + _state = new float[_stateSize]; + } + + /// Load the Silero VAD ONNX model from disk. + public void LoadModel(string modelPath) + { + if (!System.IO.File.Exists(modelPath)) + throw new System.IO.FileNotFoundException($"VAD model not found: {modelPath}"); + + var opts = new SessionOptions + { + InterOpNumThreads = 1, + IntraOpNumThreads = 1, + EnableCpuMemArena = true + }; + opts.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL; + + _session?.Dispose(); + _session = new InferenceSession(modelPath, opts); + ResetState(); + _logger.Info($"Silero VAD model loaded: {modelPath}"); + } + + /// Reset the internal RNN state (call between utterances). + public void ResetState() + { + Array.Clear(_state, 0, _state.Length); + } + + /// + /// Process a single chunk of audio and return the speech probability (0.0–1.0). + /// Input must be exactly float samples at 16 kHz. + /// + public float ProcessChunk(float[] audioChunk) + { + if (_session == null) + throw new InvalidOperationException("VAD model not loaded. Call LoadModel first."); + + if (audioChunk.Length != ChunkSamples) + throw new ArgumentException($"Audio chunk must be exactly {ChunkSamples} samples, got {audioChunk.Length}"); + + // Build input tensors matching Silero VAD v5 expected shapes. 
+ // See: github.com/snakers4/silero-vad/blob/master/examples/csharp/SileroVadOnnxModel.cs + var inputTensor = new DenseTensor(audioChunk, new[] { 1, ChunkSamples }); + var srTensor = new DenseTensor(new long[] { SampleRate }, new[] { 1 }); + var stateTensor = new DenseTensor(_state, new[] { 2, 1, 128 }); + + using var results = _session.Run(new List + { + NamedOnnxValue.CreateFromTensor("input", inputTensor), + NamedOnnxValue.CreateFromTensor("sr", srTensor), + NamedOnnxValue.CreateFromTensor("state", stateTensor) + }); + + float probability = 0f; + foreach (var result in results) + { + if (result.Name == "output") + { + var tensor = result.AsTensor(); + probability = tensor.Length > 0 ? tensor.GetValue(0) : 0f; + } + else if (result.Name == "stateN") + { + var newState = result.AsTensor(); + for (int i = 0; i < _stateSize && i < newState.Length; i++) + _state[i] = newState.GetValue(i); + } + } + + return probability; + } + + public void Dispose() + { + _session?.Dispose(); + } +} diff --git a/src/OpenClaw.Shared/Audio/WhisperModelManager.cs b/src/OpenClaw.Shared/Audio/WhisperModelManager.cs new file mode 100644 index 00000000..b23bc054 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/WhisperModelManager.cs @@ -0,0 +1,138 @@ +using System; +using System.IO; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Audio; + +/// +/// Manages Whisper GGML model downloads, storage, and lifecycle. +/// Models are stored in %APPDATA%\OpenClawTray\models\ (or the +/// configured data directory). +/// +public sealed class WhisperModelManager +{ + private readonly string _modelsDirectory; + private readonly IOpenClawLogger _logger; + + /// Known Whisper model definitions. 
+ public static readonly WhisperModelInfo[] AvailableModels = + [ + new("ggml-tiny.bin", "tiny", 75_000_000, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin"), + new("ggml-base.bin", "base", 142_000_000, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"), + new("ggml-small.bin", "small", 466_000_000, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin"), + ]; + + public WhisperModelManager(string dataDirectory, IOpenClawLogger logger) + { + _modelsDirectory = Path.Combine(dataDirectory, "models"); + _logger = logger; + Directory.CreateDirectory(_modelsDirectory); + } + + /// Full file path for a given model name. + public string GetModelPath(string modelName) + { + var info = FindModel(modelName); + return Path.Combine(_modelsDirectory, info.FileName); + } + + /// Check whether a model file already exists on disk. + public bool IsModelDownloaded(string modelName) + { + var path = GetModelPath(modelName); + return File.Exists(path); + } + + /// Get the size of a downloaded model, or 0 if not downloaded. + public long GetModelSize(string modelName) + { + var path = GetModelPath(modelName); + return File.Exists(path) ? new FileInfo(path).Length : 0; + } + + /// + /// Download a model from HuggingFace if not already present. + /// Reports progress as bytes downloaded / total bytes. + /// + public async Task DownloadModelAsync( + string modelName, + IProgress<(long downloaded, long total)>? 
progress = null, + CancellationToken cancellationToken = default) + { + var info = FindModel(modelName); + var destPath = Path.Combine(_modelsDirectory, info.FileName); + + if (File.Exists(destPath)) + { + _logger.Info($"Model '{modelName}' already exists at {destPath}"); + return; + } + + _logger.Info($"Downloading model '{modelName}' from {info.DownloadUrl}"); + var tempPath = destPath + ".tmp"; + + try + { + using var httpClient = new HttpClient(); + httpClient.Timeout = TimeSpan.FromMinutes(30); + using var response = await httpClient.GetAsync(info.DownloadUrl, HttpCompletionOption.ResponseHeadersRead, cancellationToken); + response.EnsureSuccessStatusCode(); + + var totalBytes = response.Content.Headers.ContentLength ?? info.ApproximateSizeBytes; + using var contentStream = await response.Content.ReadAsStreamAsync(cancellationToken); + using var fileStream = new FileStream(tempPath, FileMode.Create, FileAccess.Write, FileShare.None, 81920); + + var buffer = new byte[81920]; + long downloadedBytes = 0; + int bytesRead; + + while ((bytesRead = await contentStream.ReadAsync(buffer, cancellationToken)) > 0) + { + await fileStream.WriteAsync(buffer.AsMemory(0, bytesRead), cancellationToken); + downloadedBytes += bytesRead; + progress?.Report((downloadedBytes, totalBytes)); + } + + await fileStream.FlushAsync(cancellationToken); + fileStream.Close(); + + File.Move(tempPath, destPath, overwrite: true); + _logger.Info($"Model '{modelName}' downloaded successfully ({downloadedBytes:N0} bytes)"); + } + catch + { + // Clean up partial download + try { if (File.Exists(tempPath)) File.Delete(tempPath); } catch { /* best effort */ } + throw; + } + } + + /// Delete a downloaded model file. 
+ public bool DeleteModel(string modelName) + { + var path = GetModelPath(modelName); + if (!File.Exists(path)) return false; + File.Delete(path); + _logger.Info($"Deleted model '{modelName}'"); + return true; + } + + private static WhisperModelInfo FindModel(string modelName) + { + foreach (var m in AvailableModels) + { + if (string.Equals(m.Name, modelName, StringComparison.OrdinalIgnoreCase)) + return m; + } + throw new ArgumentException($"Unknown model: '{modelName}'. Available: tiny, base, small"); + } +} + +/// Metadata about a Whisper model variant. +public sealed record WhisperModelInfo( + string FileName, + string Name, + long ApproximateSizeBytes, + string DownloadUrl); diff --git a/src/OpenClaw.Shared/Capabilities/SttCapability.cs b/src/OpenClaw.Shared/Capabilities/SttCapability.cs new file mode 100644 index 00000000..f6abef33 --- /dev/null +++ b/src/OpenClaw.Shared/Capabilities/SttCapability.cs @@ -0,0 +1,147 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Capabilities; + +/// +/// Speech-to-text node capability. Allows the OpenClaw agent to +/// trigger microphone listening and receive transcribed text. +/// +public sealed class SttCapability : NodeCapabilityBase +{ + public const string ListenCommand = "stt.listen"; + public const string StatusCommand = "stt.status"; + + private static readonly string[] _commands = [ListenCommand, StatusCommand]; + + public override string Category => "stt"; + public override IReadOnlyList Commands => _commands; + + /// + /// Fired when the agent requests listening. The handler should start + /// the microphone, wait for speech + silence, transcribe, and return + /// the text result. + /// + public event Func>? ListenRequested; + + /// + /// Fired when the agent queries STT status (model loaded, etc.). + /// + public event Func>? 
StatusRequested; + + public SttCapability(IOpenClawLogger logger) : base(logger) { } + + public override Task ExecuteAsync(NodeInvokeRequest request) + => ExecuteAsync(request, CancellationToken.None); + + public override async Task ExecuteAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + return request.Command switch + { + ListenCommand => await HandleListenAsync(request, cancellationToken), + StatusCommand => await HandleStatusAsync(), + _ => Error($"Unknown command: {request.Command}") + }; + } + + private async Task HandleListenAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + if (ListenRequested == null) + return Error("STT listen not available"); + + var timeoutMs = GetIntArg(request.Args, "timeoutMs", 30000); + if (timeoutMs < 1000) timeoutMs = 1000; + if (timeoutMs > 120000) timeoutMs = 120000; + + var language = GetStringArg(request.Args, "language", "auto") ?? "auto"; + + var args = new SttListenArgs + { + TimeoutMs = timeoutMs, + Language = language + }; + + Logger.Info($"stt.listen: timeoutMs={timeoutMs}, language={language}"); + + try + { + var result = await ListenRequested(args, cancellationToken); + return Success(new + { + text = result.Text, + language = result.Language, + durationMs = result.DurationMs, + segments = result.Segments + }); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + return Error("Listen canceled"); + } + catch (TimeoutException) + { + return Error("No speech detected within timeout"); + } + catch (Exception ex) + { + Logger.Error("STT listen failed", ex); + return Error($"Listen failed: {ex.Message}"); + } + } + + private async Task HandleStatusAsync() + { + if (StatusRequested == null) + return Error("STT status not available"); + + try + { + var result = await StatusRequested(); + return Success(new + { + modelLoaded = result.ModelLoaded, + modelName = result.ModelName, + isListening = result.IsListening + }); + } + 
catch (Exception ex) + { + Logger.Error("STT status failed", ex); + return Error($"Status failed: {ex.Message}"); + } + } +} + +public sealed class SttListenArgs +{ + public int TimeoutMs { get; set; } = 30000; + public string Language { get; set; } = "auto"; +} + +public sealed class SttListenResult +{ + public string Text { get; set; } = ""; + public string? Language { get; set; } + public int DurationMs { get; set; } + public List? Segments { get; set; } +} + +public sealed class SttSegment +{ + public string Text { get; set; } = ""; + public int StartMs { get; set; } + public int EndMs { get; set; } +} + +public sealed class SttStatusResult +{ + public bool ModelLoaded { get; set; } + public string? ModelName { get; set; } + public bool IsListening { get; set; } +} diff --git a/src/OpenClaw.Shared/OpenClaw.Shared.csproj b/src/OpenClaw.Shared/OpenClaw.Shared.csproj index ac832955..2c0a6df5 100644 --- a/src/OpenClaw.Shared/OpenClaw.Shared.csproj +++ b/src/OpenClaw.Shared/OpenClaw.Shared.csproj @@ -15,6 +15,13 @@ + + + + + + + diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs index f1983d88..c822e93e 100644 --- a/src/OpenClaw.Shared/SettingsData.cs +++ b/src/OpenClaw.Shared/SettingsData.cs @@ -64,6 +64,19 @@ public class SettingsData public bool PreferStructuredCategories { get; set; } = true; public List? UserRules { get; set; } + // ── Voice / STT settings ── + public bool NodeSttEnabled { get; set; } = false; + /// Whisper model name: "tiny", "base", or "small". + public string SttModelName { get; set; } = "base"; + /// STT language code ("auto" for auto-detect, or ISO 639-1). + public string SttLanguage { get; set; } = "auto"; + /// Seconds of silence before auto-submit in voice chat mode. + public float SttSilenceTimeout { get; set; } = 2.5f; + /// Enable TTS playback of responses during voice sessions. + public bool VoiceTtsEnabled { get; set; } = true; + /// Play audio feedback chimes on listen start/stop. 
+ public bool VoiceAudioFeedback { get; set; } = true; + private static readonly JsonSerializerOptions s_options = new() { WriteIndented = true, diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs index faa5db80..f1c89e28 100644 --- a/src/OpenClaw.Tray.WinUI/App.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs @@ -378,6 +378,7 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) { _globalHotkey = new GlobalHotkeyService(); _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed; + _globalHotkey.VoiceHotkeyPressed += OnVoiceHotkeyPressed; _globalHotkey.Register(); } @@ -452,6 +453,35 @@ private void ShowChatWindow() } } + private VoiceOverlayWindow? _voiceOverlayWindow; + + private void ShowVoiceOverlay() + { + var voiceService = _nodeService?.VoiceService; + if (voiceService == null) + { + // STT not enabled — show settings + ShowHub("voice"); + return; + } + + if (_voiceOverlayWindow == null || _voiceOverlayWindow.AppWindow == null) + { + _voiceOverlayWindow = new VoiceOverlayWindow(voiceService, new AppLogger()); + _voiceOverlayWindow.Closed += (_, _) => _voiceOverlayWindow = null; + // Wire transcription to gateway chat when connected + _voiceOverlayWindow.TextSubmitted += text => + { + if (_gatewayClient != null && _currentStatus == ConnectionStatus.Connected) + { + _ = _gatewayClient.SendChatMessageAsync(text); + } + }; + } + + _voiceOverlayWindow.Activate(); + } + private void OnTrayContextMenu(TrayIcon sender, TrayIconEventArgs e) { // Right-click: show menu @@ -502,6 +532,7 @@ private void OnTrayMenuItemClicked(object? 
sender, string action) case "dashboard": OpenDashboard(); break; case "canvas": _nodeService?.ShowCanvasWindow(); break; case "openchat": ShowChatWindow(); break; + case "voice": ShowVoiceOverlay(); break; case "webchat": ShowWebChat(); break; case "hub": ShowHub(); break; case "companion": @@ -1029,6 +1060,7 @@ private void BuildTrayMenuPopup(TrayMenuWindow menu) menu.AddMenuItem("Dashboard", "🌐", "dashboard"); menu.AddMenuItem("Chat", "💬", "openchat"); menu.AddMenuItem("Canvas", "🎨", "canvas"); + menu.AddMenuItem("Voice", "🎙️", "voice"); menu.AddMenuItem("Companion", "🦞", "companion"); menu.AddMenuItem(LocalizationHelper.GetString("Menu_QuickSend"), "📤", "quicksend"); @@ -1850,6 +1882,7 @@ private void SyncHubNodeState() _hubWindow.NodeIsPendingApproval = _nodeService.IsPendingApproval; _hubWindow.NodeShortDeviceId = _nodeService.ShortDeviceId; _hubWindow.NodeFullDeviceId = _nodeService.FullDeviceId; + _hubWindow.VoiceServiceInstance = _nodeService.VoiceService; } else { @@ -2286,6 +2319,26 @@ private void OnNotificationReceived(object? sender, OpenClawNotification notific $"{notification.Type ?? "info"}: {notification.Title ?? "notification"}", category: "notification", details: notification.Message); + + // Voice overlay: show agent chat responses and optionally speak them + if (notification.IsChat && _voiceOverlayWindow != null && !string.IsNullOrEmpty(notification.Message)) + { + _dispatcherQueue?.TryEnqueue(() => + { + try + { + _voiceOverlayWindow?.AddAgentResponse(notification.Message); + } + catch { } + }); + + // TTS: read response aloud if voice is active and TTS enabled + if (_settings?.VoiceTtsEnabled == true && _nodeService?.VoiceService?.CurrentMode != VoiceMode.Inactive) + { + _ = SpeakResponseAsync(notification.Message); + } + } + if (_settings?.ShowNotifications != true) return; if (!ShouldShowNotification(notification)) return; @@ -3602,8 +3655,6 @@ private void CopyExtensibilitySummary() private void OnGlobalHotkeyPressed(object? 
sender, EventArgs e) { - // Hotkey events are raised from a dedicated Win32 message-loop thread. - // Creating/activating WinUI windows must happen on the app's UI thread. if (_dispatcherQueue == null) { Logger.Warn("Hotkey pressed but DispatcherQueue is null"); @@ -3617,6 +3668,12 @@ private void OnGlobalHotkeyPressed(object? sender, EventArgs e) } } + private void OnVoiceHotkeyPressed(object? sender, EventArgs e) + { + if (_dispatcherQueue == null) return; + _dispatcherQueue.TryEnqueue(() => ShowVoiceOverlay()); + } + #endregion #region Updates @@ -3836,6 +3893,8 @@ private void HandleDeepLink(string uri) OpenDashboard = OpenDashboard, OpenQuickSend = ShowQuickSend, OpenHub = (page) => ShowHub(page), + OpenVoice = () => ShowVoiceOverlay(), + StopVoice = () => _ = StopVoiceAsync(), SendMessage = async (msg) => { if (_gatewayClient != null) @@ -3846,6 +3905,62 @@ private void HandleDeepLink(string uri) }); } + private async Task StopVoiceAsync() + { + var voiceService = _nodeService?.VoiceService; + if (voiceService != null) + await voiceService.StopAsync(); + } + + private int _ttsMuteCount; + + private async Task SpeakResponseAsync(string text) + { + var voiceService = _nodeService?.VoiceService; + try + { + if (voiceService == null || _settings == null) return; + + // Increment mute counter — multiple concurrent TTS won't unmute prematurely + Interlocked.Increment(ref _ttsMuteCount); + voiceService.IsMutedForPlayback = true; + + var speakText = text.Length > 500 ? text[..500] + "..." : text; + + var speakArgs = new OpenClaw.Shared.Capabilities.TtsSpeakArgs + { + Text = speakText, + Provider = _settings.TtsProvider ?? 
"windows", + VoiceId = _settings.TtsElevenLabsVoiceId, + Interrupt = true + }; + + var tts = new TextToSpeechService(new AppLogger(), _settings); + try + { + await tts.SpeakAsync(speakArgs); + } + finally + { + tts.Dispose(); + } + } + catch (Exception ex) + { + Logger.Warn($"TTS response playback failed: {ex.Message}"); + } + finally + { + // Only unmute when all concurrent TTS operations have finished + if (voiceService != null) + { + await Task.Delay(300); + if (Interlocked.Decrement(ref _ttsMuteCount) <= 0) + voiceService.IsMutedForPlayback = false; + } + } + } + private static void SendDeepLinkToRunningInstance(string uri) { try diff --git a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj index b9873035..2974ab5d 100644 --- a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj +++ b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj @@ -60,6 +60,7 @@ + diff --git a/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml.cs b/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml.cs index 0430cdc3..9b037a2b 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml.cs @@ -43,6 +43,7 @@ private void BuildCapabilityToggles(HubWindow hub) ("🖥️", "Screen Capture", settings.NodeScreenEnabled, v => settings.NodeScreenEnabled = v), ("📍", "Location", settings.NodeLocationEnabled, v => settings.NodeLocationEnabled = v), ("🔊", "Text-to-Speech", settings.NodeTtsEnabled, v => settings.NodeTtsEnabled = v), + ("🎙️", "Speech-to-Text", settings.NodeSttEnabled, v => settings.NodeSttEnabled = v), }; var items = new List(); diff --git a/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml b/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml new file mode 100644 index 00000000..a77bc9d8 --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml @@ -0,0 +1,154 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs new file mode 100644 index 00000000..3d2aafd4 --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs @@ -0,0 +1,314 @@ +using System; +using System.Threading.Tasks; +using Microsoft.UI.Dispatching; +using Microsoft.UI.Xaml; +using Microsoft.UI.Xaml.Controls; +using Microsoft.UI.Xaml.Media; +using OpenClaw.Shared; +using OpenClaw.Shared.Audio; +using OpenClawTray.Services; +using WinUIEx; + +namespace OpenClawTray.Windows; + +/// +/// Floating voice overlay window for voice chat sessions. +/// Shows conversation transcript, audio levels, and controls. +/// +public sealed partial class VoiceOverlayWindow : WindowEx +{ + private readonly VoiceService _voiceService; + private readonly IOpenClawLogger _logger; + private readonly DispatcherQueue _dispatcherQueue; + private bool _isMuted; + + /// Fired when the user submits transcribed text to the agent. + public event Action? TextSubmitted; + + public VoiceOverlayWindow(VoiceService voiceService, IOpenClawLogger logger) + { + InitializeComponent(); + _voiceService = voiceService; + _logger = logger; + _dispatcherQueue = DispatcherQueue.GetForCurrentThread(); + + // Modern custom title bar + ExtendsContentIntoTitleBar = true; + SetTitleBar(AppTitleBar); + + _voiceService.TranscriptionReceived += OnTranscriptionReceived; + _voiceService.SpeakingChanged += OnSpeakingChanged; + _voiceService.AudioLevelChanged += OnAudioLevelChanged; + _voiceService.ModeChanged += OnModeChanged; + _voiceService.PipelineStateChanged += OnPipelineStateChanged; + _voiceService.DiagnosticMessage += OnDiagnosticMessage; + + Closed += WindowClosed; + UpdateUI(); + } + + private DateTime _lastUserBubbleTime = DateTime.MinValue; + private TextBlock? 
_lastUserTextBlock; + + private void OnTranscriptionReceived(string text) + { + _dispatcherQueue.TryEnqueue(() => + { + // Consolidate: if the last bubble was a user bubble within 5 seconds, + // append to it instead of creating a new one. + var elapsed = DateTime.UtcNow - _lastUserBubbleTime; + if (_lastUserTextBlock != null && elapsed.TotalSeconds < 5) + { + _lastUserTextBlock.Text += " " + text; + _lastUserBubbleTime = DateTime.UtcNow; + try + { + TranscriptScroller.UpdateLayout(); + TranscriptScroller.ChangeView(null, TranscriptScroller.ScrollableHeight, null); + } + catch { } + } + else + { + AddTranscriptBubble(text, isUser: true); + } + TextSubmitted?.Invoke(text); + }); + } + + /// Add an agent response to the transcript. + public void AddAgentResponse(string text) + { + _dispatcherQueue.TryEnqueue(() => + { + AddTranscriptBubble(text, isUser: false); + }); + } + + private void AddTranscriptBubble(string text, bool isUser) + { + try + { + // Hide empty state on first message + if (EmptyState.Visibility == Visibility.Visible) + EmptyState.Visibility = Visibility.Collapsed; + + var bubble = new Border + { + Background = isUser + ? new SolidColorBrush(Microsoft.UI.Colors.DodgerBlue) + : (Brush)Application.Current.Resources["CardBackgroundFillColorDefaultBrush"], + CornerRadius = isUser + ? new CornerRadius(12, 12, 4, 12) + : new CornerRadius(12, 12, 12, 4), + Padding = new Thickness(12, 10, 12, 10), + HorizontalAlignment = isUser + ? HorizontalAlignment.Right + : HorizontalAlignment.Left, + Margin = new Thickness(isUser ? 24 : 0, 4, isUser ? 0 : 24, 4) + }; + + var icon = isUser ? 
"\uE77B" : "\uE799"; // Person / Robot + var grid = new Grid { ColumnSpacing = 8 }; + grid.ColumnDefinitions.Add(new ColumnDefinition { Width = GridLength.Auto }); + grid.ColumnDefinitions.Add(new ColumnDefinition { Width = new GridLength(1, GridUnitType.Star) }); + + var fontIcon = new FontIcon { Glyph = icon, FontSize = 12, VerticalAlignment = VerticalAlignment.Top, Margin = new Thickness(0, 3, 0, 0) }; + Grid.SetColumn(fontIcon, 0); + grid.Children.Add(fontIcon); + + var textBlock = new TextBlock + { + Text = text, + TextWrapping = TextWrapping.Wrap, + FontSize = 13, + IsTextSelectionEnabled = true + }; + if (isUser) + { + textBlock.Foreground = new SolidColorBrush(Microsoft.UI.Colors.White); + _lastUserTextBlock = textBlock; + _lastUserBubbleTime = DateTime.UtcNow; + } + else + { + // Agent response breaks the consolidation window + _lastUserTextBlock = null; + } + Grid.SetColumn(textBlock, 1); + grid.Children.Add(textBlock); + + bubble.Child = grid; + TranscriptPanel.Children.Add(bubble); + + // Auto-scroll to bottom + TranscriptScroller.UpdateLayout(); + TranscriptScroller.ChangeView(null, TranscriptScroller.ScrollableHeight, null); + } + catch (Exception ex) + { + _logger.Error("Failed to add transcript bubble", ex); + } + } + + private void OnSpeakingChanged(bool isSpeaking) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusText.Text = isSpeaking ? "🗣️ Listening..." : "Speak now — I'm listening"; + }); + } + + private void OnAudioLevelChanged(float level) + { + _dispatcherQueue.TryEnqueue(() => + { + // Scale the level bar width (max width = parent width) + var maxWidth = AudioLevelBar.Parent is FrameworkElement parent ? 
parent.ActualWidth : 300; + AudioLevelBar.Width = Math.Max(0, level * maxWidth); + }); + } + + private void OnModeChanged(VoiceMode mode) + { + _dispatcherQueue.TryEnqueue(UpdateUI); + } + + private void OnDiagnosticMessage(string message) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusText.Text = message; + }); + } + + private void OnPipelineStateChanged(AudioPipelineState state) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusBadge.Text = state switch + { + AudioPipelineState.Stopped => "Stopped", + AudioPipelineState.Starting => "Starting...", + AudioPipelineState.Listening => "Listening", + AudioPipelineState.Processing => "Processing...", + AudioPipelineState.Error => "Error", + _ => "Unknown" + }; + + StatusText.Text = state switch + { + AudioPipelineState.Stopped => "Press Start to begin", + AudioPipelineState.Starting => "Initializing microphone...", + AudioPipelineState.Listening => "Speak now — I'm listening", + AudioPipelineState.Processing => "Transcribing your speech...", + AudioPipelineState.Error => "An error occurred", + _ => "" + }; + }); + } + + private void UpdateUI() + { + var isActive = _voiceService.CurrentMode != VoiceMode.Inactive; + + StartStopIcon.Glyph = isActive ? "\uE71A" : "\uE768"; // Stop / Play + StartStopText.Text = isActive ? 
"Stop" : "Start Listening"; + MuteButton.IsEnabled = isActive; + + if (!isActive) + { + StatusBadge.Text = "Ready"; + StatusText.Text = "Press Start to begin"; + AudioLevelBar.Width = 0; + } + } + + private async void OnStartStopClick(object sender, RoutedEventArgs e) + { + try + { + if (_voiceService.CurrentMode == VoiceMode.Inactive) + { + StatusText.Text = "Initializing..."; + StatusBadge.Text = "Starting"; + StartStopButton.IsEnabled = false; + + // Initialize models if needed (may trigger downloads) + if (!_voiceService.IsModelLoaded) + { + if (!_voiceService.IsModelDownloaded) + { + StatusText.Text = "Downloading speech model..."; + var progress = new Progress<(long downloaded, long total)>(p => + { + _dispatcherQueue.TryEnqueue(() => + { + var pct = p.total > 0 ? (int)(p.downloaded * 100 / p.total) : 0; + StatusText.Text = $"Downloading model... {pct}%"; + }); + }); + await _voiceService.DownloadModelAsync(progress: progress); + } + + StatusText.Text = "Loading speech model..."; + await _voiceService.InitializeAsync(); + } + + StatusText.Text = "Starting microphone..."; + await _voiceService.StartVoiceChatAsync(); + } + else + { + StatusText.Text = "Stopping..."; + await _voiceService.StopAsync(); + } + } + catch (Exception ex) + { + _logger.Error("Voice overlay start/stop failed", ex); + StatusText.Text = $"Error: {ex.Message}"; + StatusBadge.Text = "Error"; + } + finally + { + StartStopButton.IsEnabled = true; + UpdateUI(); + } + } + + private async void OnMuteClick(object sender, RoutedEventArgs e) + { + _isMuted = !_isMuted; + MuteIcon.Glyph = _isMuted ? 
"\uE74F" : "\uE767"; // Muted / Volume + + if (_isMuted) + { + await _voiceService.StopAsync(); + StatusText.Text = "Muted"; + } + else + { + await _voiceService.StartVoiceChatAsync(); + } + } + + private void OnSettingsClick(object sender, RoutedEventArgs e) + { + // Navigate to voice settings via deep link + // This will be handled by the deep link system + } + + private void WindowClosed(object sender, WindowEventArgs args) + { + _voiceService.TranscriptionReceived -= OnTranscriptionReceived; + _voiceService.SpeakingChanged -= OnSpeakingChanged; + _voiceService.AudioLevelChanged -= OnAudioLevelChanged; + _voiceService.ModeChanged -= OnModeChanged; + _voiceService.PipelineStateChanged -= OnPipelineStateChanged; + _voiceService.DiagnosticMessage -= OnDiagnosticMessage; + + // Stop voice session when window closes + _ = _voiceService.StopAsync(); + } +} From 72d0d9ddba7a8a24569258039431025ebd64d8d1 Mon Sep 17 00:00:00 2001 From: Regis Brid Date: Tue, 5 May 2026 09:57:32 -0700 Subject: [PATCH 05/34] Merge master into user/rbrid/stt-capability Master refactored 8 separate windows into a unified Hub app (#272), which removed src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml(.cs) and WebChatWindow.xaml.cs. Node-capability toggles now live in Pages/CapabilitiesPage as a code-built list (one icon + label per capability) instead of an XAML page with x:Uid-localized headers. Conflict resolution and re-integration: * Accepted master's deletion of SettingsWindow.xaml, SettingsWindow.xaml.cs, and WebChatWindow.xaml.cs. The TTS/STT controls and code-behind that this branch added to those files are obsolete with the new Hub UI. * Pages/CapabilitiesPage.xaml.cs gains a Speech-to-Text toggle alongside the existing Camera/Canvas/Screen/Location/TTS toggles, plus 'stt' in the active-capabilities summary string. This is the natural minimal alignment with the new pattern: one capability = one entry in the toggle list. 
* The TTS provider / ElevenLabs key/voice/model UI that this branch had added is dropped because master removed the corresponding settings surface entirely. The backend services (TextToSpeechService, ElevenLabsTextToSpeechClient) and the SettingsManager keys are intact; the values can be set via direct settings.json edit until a new UI surface lands. * Resolved 5 .resw conflicts (en-us, fr-fr, nl-nl, zh-cn, zh-tw) by taking master's content. All TTS/STT resource keys this branch had added are removed because the controls referencing them are gone. The earlier SettingsMcpDescription update (adding 'microphone' and 'speakers' to the capability list) is outside the conflict region and is preserved. * Deleted tests/OpenClaw.Tray.Tests/SettingsWindowLocalizationCoverageTests.cs. It pinned that 14 specific x:Uids on SettingsWindow.xaml had matching resw entries; the controls and the file no longer exist. Refactors from this branch survived the auto-merge cleanly: * App.xaml.cs OnNodeInvokeCompleted still delegates to NodeInvokeActivityFormatter for privacy-class scrubbing. * NodeService.RegisterCapabilities still calls NodeCapabilityGating predicates for every optional capability, including TTS and STT. 
Validation: * .\build.ps1 * dotnet test tests/OpenClaw.Shared.Tests/OpenClaw.Shared.Tests.csproj --no-restore (1183 passed, 20 skipped) * dotnet test tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj --no-restore (418 passed; restore required first because master's Tray.Tests now links GatewayDiscoveryService.cs which needs Zeroconf) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/copilot-setup-steps.yml | 2 +- .github/workflows/repo-assist.lock.yml | 72 +- DEVELOPMENT.md | 16 +- README.md | 22 +- docs/LOCALIZATION.md | 10 +- docs/MCP_MODE.md | 2 +- docs/MISSION_CONTROL.md | 6 +- docs/ONBOARDING_WIZARD.md | 4 +- docs/POWERTOYS.md | 6 +- docs/SETUP.md | 10 +- docs/TEST_COVERAGE.md | 18 +- docs/a2ui/grading.md | 2 +- .../Capabilities/AppCapability.cs | 154 ++ src/OpenClaw.Shared/DeviceIdentity.cs | 8 + src/OpenClaw.Shared/Mcp/McpAuthToken.cs | 11 +- src/OpenClaw.Shared/Mcp/McpHttpServer.cs | 7 +- src/OpenClaw.Shared/Mcp/McpToolBridge.cs | 22 + src/OpenClaw.Shared/Models.cs | 263 +++ src/OpenClaw.Shared/OpenClawGatewayClient.cs | 578 +++++- src/OpenClaw.Shared/SettingsData.cs | 5 + src/OpenClaw.Tray.WinUI/App.xaml | 3 + src/OpenClaw.Tray.WinUI/App.xaml.cs | 1624 +++++++++++++---- .../Controls/SchemaConfigEditor.xaml | 10 + .../Controls/SchemaConfigEditor.xaml.cs | 477 +++++ .../Helpers/CommandCenterTextHelper.cs | 496 +++++ .../Helpers/VisualTestCapture.cs | 21 - .../Onboarding/OnboardingApp.cs | 2 +- .../Onboarding/OnboardingWindow.cs | 2 +- .../Onboarding/Pages/ConnectionPage.cs | 54 - .../Onboarding/Pages/ReadyPage.cs | 6 +- .../Onboarding/Pages/WizardPage.cs | 38 +- .../Onboarding/Services/SetupCodeDecoder.cs | 17 +- .../Onboarding/Widgets/FeatureRow.cs | 32 - .../Onboarding/Widgets/OnboardingCard.cs | 22 - .../Onboarding/Widgets/WizardStepModels.cs | 26 - .../Onboarding/Widgets/WizardStepView.cs | 166 -- .../OpenClaw.Tray.WinUI.csproj | 3 +- src/OpenClaw.Tray.WinUI/Pages/AboutPage.xaml | 103 ++ 
.../Pages/AboutPage.xaml.cs | 129 ++ .../Pages/ActivityPage.xaml | 101 + .../Pages/ActivityPage.xaml.cs | 143 ++ .../Pages/AgentEventsPage.xaml | 113 ++ .../Pages/AgentEventsPage.xaml.cs | 151 ++ .../Pages/BindingsPage.xaml | 87 + .../Pages/BindingsPage.xaml.cs | 108 ++ .../Pages/CapabilitiesPage.xaml | 82 + .../Pages/CapabilitiesPage.xaml.cs | 167 ++ .../Pages/ChannelsPage.xaml | 29 + .../Pages/ChannelsPage.xaml.cs | 200 ++ src/OpenClaw.Tray.WinUI/Pages/ChatPage.xaml | 85 + .../Pages/ChatPage.xaml.cs | 179 ++ src/OpenClaw.Tray.WinUI/Pages/ConfigPage.xaml | 111 ++ .../Pages/ConfigPage.xaml.cs | 766 ++++++++ .../Pages/ConnectionPage.xaml | 253 +++ .../Pages/ConnectionPage.xaml.cs | 572 ++++++ .../Pages/ConversationsPage.xaml | 96 + .../Pages/ConversationsPage.xaml.cs | 133 ++ src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml | 130 ++ .../Pages/CronPage.xaml.cs | 228 +++ src/OpenClaw.Tray.WinUI/Pages/DebugPage.xaml | 121 ++ .../Pages/DebugPage.xaml.cs | 199 ++ src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml | 84 + .../Pages/HomePage.xaml.cs | 304 +++ .../Pages/InstancesPage.xaml | 33 + .../Pages/InstancesPage.xaml.cs | 183 ++ src/OpenClaw.Tray.WinUI/Pages/NodesPage.xaml | 47 + .../Pages/NodesPage.xaml.cs | 453 +++++ .../Pages/PermissionsPage.xaml | 129 ++ .../Pages/PermissionsPage.xaml.cs | 241 +++ .../Pages/SessionsPage.xaml | 90 + .../Pages/SessionsPage.xaml.cs | 185 ++ .../Pages/SettingsPage.xaml | 93 + .../Pages/SettingsPage.xaml.cs | 115 ++ src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml | 92 + .../Pages/SkillsPage.xaml.cs | 196 ++ src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml | 119 ++ .../Pages/UsagePage.xaml.cs | 124 ++ .../Pages/WorkspacePage.xaml | 56 + .../Pages/WorkspacePage.xaml.cs | 180 ++ .../Services/DeepLinkHandler.cs | 47 +- .../Services/GatewayDiscoveryService.cs | 221 +++ .../Services/NodeService.cs | 12 + .../Services/SettingsManager.cs | 3 + .../ElevenLabsTextToSpeechClient.cs | 10 +- .../Strings/en-us/Resources.resw | 941 +++++++++- 
.../Strings/fr-fr/Resources.resw | 1047 +++++++++-- .../Strings/nl-nl/Resources.resw | 1134 ++++++++++-- .../Strings/zh-cn/Resources.resw | 1030 +++++++++-- .../Strings/zh-tw/Resources.resw | 1032 +++++++++-- .../Windows/ActivityStreamWindow.xaml | 102 -- .../Windows/ActivityStreamWindow.xaml.cs | 209 --- .../Windows/CanvasWindow.xaml | 99 +- .../Windows/CanvasWindow.xaml.cs | 2 + .../Windows/ChatWindow.xaml | 91 + .../Windows/ChatWindow.xaml.cs | 250 +++ .../Windows/CommandPaletteDialog.xaml | 36 + .../Windows/CommandPaletteDialog.xaml.cs | 102 ++ .../Windows/HubWindow.xaml | 166 ++ .../Windows/HubWindow.xaml.cs | 798 ++++++++ .../Windows/NotificationHistoryWindow.xaml | 88 - .../Windows/NotificationHistoryWindow.xaml.cs | 116 -- .../Windows/SettingsWindow.xaml | 348 ---- .../Windows/SettingsWindow.xaml.cs | 839 --------- .../Windows/StatusDetailWindow.xaml | 653 ------- .../Windows/StatusDetailWindow.xaml.cs | 1174 ------------ .../Windows/TrayMenuWindow.xaml | 11 +- .../Windows/TrayMenuWindow.xaml.cs | 181 +- .../Windows/WebChatWindow.xaml | 69 - .../Windows/WebChatWindow.xaml.cs | 525 ------ .../OpenClawTray.FunctionalUI.csproj | 2 +- .../AppCapabilityTests.cs | 78 + .../DeviceIdentityTests.cs | 9 + tests/OpenClaw.Shared.Tests/ModelsTests.cs | 10 + .../OpenClawGatewayClientTests.cs | 19 + tests/OpenClaw.Shared.Tests/SystemRunTests.cs | 4 +- .../DeepLinkParserTests.cs | 14 +- .../ElevenLabsTextToSpeechClientTests.cs | 5 +- .../GatewayDiscoveryServiceTests.cs | 76 + .../LocalizationValidationTests.cs | 125 +- .../OpenClaw.Tray.Tests.csproj | 3 +- .../SecurityValidationTests.cs | 6 +- ...SettingsWindowLocalizationCoverageTests.cs | 113 -- .../SetupCodeDecoderTests.cs | 36 +- .../TrayMenuWindowMarkupTests.cs | 382 +--- .../WizardStepPropsTests.cs | 78 - .../OpenClaw.Tray.UITests.csproj | 2 +- 126 files changed, 17318 insertions(+), 6207 deletions(-) create mode 100644 src/OpenClaw.Shared/Capabilities/AppCapability.cs create mode 100644 
src/OpenClaw.Tray.WinUI/Controls/SchemaConfigEditor.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Controls/SchemaConfigEditor.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Helpers/CommandCenterTextHelper.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Onboarding/Widgets/FeatureRow.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Onboarding/Widgets/OnboardingCard.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Onboarding/Widgets/WizardStepModels.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Onboarding/Widgets/WizardStepView.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/AboutPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/AboutPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ActivityPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ActivityPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/AgentEventsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/AgentEventsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/BindingsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/BindingsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ChannelsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ChannelsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ChatPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ChatPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ConfigPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ConfigPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ConnectionPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ConnectionPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ConversationsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/ConversationsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml create mode 100644 
src/OpenClaw.Tray.WinUI/Pages/CronPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/DebugPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/DebugPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/InstancesPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/InstancesPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/NodesPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/NodesPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/PermissionsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/PermissionsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/SessionsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/SessionsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/SettingsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/SettingsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/SkillsPage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/UsagePage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Pages/WorkspacePage.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Pages/WorkspacePage.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Services/GatewayDiscoveryService.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/ActivityStreamWindow.xaml delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/ActivityStreamWindow.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Windows/ChatWindow.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Windows/ChatWindow.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Windows/CommandPaletteDialog.xaml create mode 100644 src/OpenClaw.Tray.WinUI/Windows/CommandPaletteDialog.xaml.cs create mode 100644 src/OpenClaw.Tray.WinUI/Windows/HubWindow.xaml create 
mode 100644 src/OpenClaw.Tray.WinUI/Windows/HubWindow.xaml.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/NotificationHistoryWindow.xaml delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/NotificationHistoryWindow.xaml.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/StatusDetailWindow.xaml delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/StatusDetailWindow.xaml.cs delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml delete mode 100644 src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs create mode 100644 tests/OpenClaw.Shared.Tests/AppCapabilityTests.cs create mode 100644 tests/OpenClaw.Tray.Tests/GatewayDiscoveryServiceTests.cs delete mode 100644 tests/OpenClaw.Tray.Tests/SettingsWindowLocalizationCoverageTests.cs delete mode 100644 tests/OpenClaw.Tray.Tests/WizardStepPropsTests.cs diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml index 8cfd3fdc..cde4be10 100644 --- a/.github/workflows/copilot-setup-steps.yml +++ b/.github/workflows/copilot-setup-steps.yml @@ -21,6 +21,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v6 - name: Install gh-aw extension - uses: github/gh-aw-actions/setup-cli@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + uses: github/gh-aw-actions/setup-cli@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 with: version: v0.68.1 diff --git a/.github/workflows/repo-assist.lock.yml b/.github/workflows/repo-assist.lock.yml index f88f8d0d..134d922b 100644 --- a/.github/workflows/repo-assist.lock.yml +++ b/.github/workflows/repo-assist.lock.yml @@ -47,9 +47,9 @@ # Custom actions used: # - actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 # - actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 -# - actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 +# - 
actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 # - actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 -# - github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 +# - github/gh-aw-actions/setup@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 # # Container images used: # - ghcr.io/github/gh-aw-firewall/agent:0.25.20@sha256:9161f2415a3306a344aca34dd671ee69f122317e0a512e66dc64c94b9c508682 @@ -131,7 +131,7 @@ jobs: steps: - name: Setup Scripts id: setup - uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + uses: github/gh-aw-actions/setup@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 with: destination: ${{ runner.temp }}/gh-aw/actions job-name: ${{ github.job }} @@ -155,7 +155,7 @@ jobs: GH_AW_INFO_AWMG_VERSION: "" GH_AW_INFO_FIREWALL_TYPE: "squid" GH_AW_COMPILED_STRICT: "true" - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 with: script: | const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); @@ -165,7 +165,7 @@ jobs: - name: Add eyes reaction for immediate feedback id: react if: github.event_name == 'issues' || github.event_name == 'issue_comment' || github.event_name == 'pull_request_review_comment' || github.event_name == 'discussion' || github.event_name == 'discussion_comment' || github.event_name == 'pull_request' && github.event.pull_request.head.repo.id == github.repository_id - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_REACTION: "eyes" with: @@ -191,7 +191,7 @@ jobs: fetch-depth: 1 - name: Check workflow lock file id: check-lock-file - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: 
GH_AW_WORKFLOW_FILE: "repo-assist.lock.yml" GH_AW_CONTEXT_WORKFLOW_REF: "${{ github.workflow_ref }}" @@ -202,7 +202,7 @@ jobs: const { main } = require('${{ runner.temp }}/gh-aw/actions/check_workflow_timestamp_api.cjs'); await main(); - name: Check compile-agentic version - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_COMPILED_VERSION: "v0.68.3" with: @@ -213,7 +213,7 @@ jobs: await main(); - name: Compute current body text id: sanitized - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 with: script: | const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); @@ -223,7 +223,7 @@ jobs: - name: Add comment with workflow run link id: add-comment if: github.event_name == 'issues' || github.event_name == 'issue_comment' || github.event_name == 'pull_request_review_comment' || github.event_name == 'discussion' || github.event_name == 'discussion_comment' || github.event_name == 'pull_request' && github.event.pull_request.head.repo.id == github.repository_id - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_WORKFLOW_NAME: "Repo Assist" GH_AW_SAFE_OUTPUT_MESSAGES: "{\"footer\":\"\\u003e Generated by 🌈 {workflow_name}, see [workflow run]({run_url}). 
[Learn more](https://github.com/githubnext/agentics/blob/main/docs/repo-assist.md).\",\"runStarted\":\"{workflow_name} is processing {event_type}, see [workflow run]({run_url})...\",\"runSuccess\":\"✓ {workflow_name} completed successfully, see [workflow run]({run_url}).\",\"runFailure\":\"✗ {workflow_name} encountered {status}, see [workflow run]({run_url}).\"}" @@ -314,7 +314,7 @@ jobs: GH_AW_PROMPT_0b7a82d8a513bd25_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} @@ -328,7 +328,7 @@ jobs: const { main } = require('${{ runner.temp }}/gh-aw/actions/interpolate_prompt.cjs'); await main(); - name: Substitute placeholders - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt GH_AW_GITHUB_ACTOR: ${{ github.actor }} @@ -430,7 +430,7 @@ jobs: steps: - name: Setup Scripts id: setup - uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + uses: github/gh-aw-actions/setup@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 with: destination: ${{ runner.temp }}/gh-aw/actions job-name: ${{ github.job }} @@ -503,7 +503,7 @@ jobs: id: checkout-pr if: | github.event.pull_request || github.event.issue.pull_request - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} with: @@ -835,7 +835,7 @@ jobs: "customValidation": "requiresOneOf:status,title,body" } } - uses: 
actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 with: script: | const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); @@ -1027,7 +1027,7 @@ jobs: bash "${RUNNER_TEMP}/gh-aw/actions/stop_mcp_gateway.sh" "$GATEWAY_PID" - name: Redact secrets in logs if: always() - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 with: script: | const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); @@ -1053,7 +1053,7 @@ jobs: - name: Ingest agent output id: collect_output if: always() - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} GH_AW_ALLOWED_DOMAINS: "*.gradle-enterprise.cloud,*.pythonhosted.org,*.vsblob.vsassets.io,adoptium.net,anaconda.org,api.adoptium.net,api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.foojay.io,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.npms.io,api.nuget.org,api.snapcraft.io,archive.apache.org,archive.ubuntu.com,azure.archive.ubuntu.com,azuresearch-usnc.nuget.org,azuresearch-ussc.nuget.org,binstar.org,bootstrap.pypa.io,builds.dotnet.microsoft.com,bun.sh,cdn.azul.com,cdn.jsdelivr.net,central.sonatype.com,ci.dot.net,conda.anaconda.org,conda.binstar.org,crates.io,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,dc.services.visualstudio.com,deb.nodesource.com,deno.land,develocity.apache.org,dist.nuget.org,dl.google.com,dlcdn.apache.org,dot.net,dotnet.microsoft.com,dotnetcli.blob.core.windows.net,download.eclipse.org,download.java.net,download.oracle.com,
downloads.gradle-dn.com,esm.sh,files.pythonhosted.org,ge.spockframework.org,get.pnpm.io,github.com,googleapis.deno.dev,googlechromelabs.github.io,gradle.org,host.docker.internal,index.crates.io,jcenter.bintray.com,jdk.java.net,json-schema.org,json.schemastore.org,jsr.io,keyserver.ubuntu.com,maven-central.storage-download.googleapis.com,maven.apache.org,maven.google.com,maven.oracle.com,maven.pkg.github.com,nodejs.org,npm.pkg.github.com,npmjs.com,npmjs.org,nuget.org,nuget.pkg.github.com,nugetregistryv2prod.blob.core.windows.net,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,oneocsp.microsoft.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,pip.pypa.io,pkgs.dev.azure.com,plugins-artifacts.gradle.org,plugins.gradle.org,ppa.launchpad.net,pypi.org,pypi.python.org,raw.githubusercontent.com,registry.bower.io,registry.npmjs.com,registry.npmjs.org,registry.yarnpkg.com,repo.anaconda.com,repo.continuum.io,repo.gradle.org,repo.grails.org,repo.maven.apache.org,repo.spring.io,repo.yarnpkg.com,repo1.maven.org,repository.apache.org,s.symcb.com,s.symcd.com,scans-in.gradle.com,security.ubuntu.com,services.gradle.org,sh.rustup.rs,skimdb.npmjs.com,static.crates.io,static.rust-lang.org,storage.googleapis.com,telemetry.enterprise.githubcopilot.com,telemetry.vercel.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com,www.java.com,www.microsoft.com,www.npmjs.com,www.npmjs.org,yarnpkg.com" @@ -1068,7 +1068,7 @@ jobs: await main(); - name: Parse agent logs for step summary if: always() - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_AGENT_OUTPUT: /tmp/gh-aw/sandbox/agent/logs/ with: @@ -1080,7 +1080,7 @@ jobs: - name: Parse MCP Gateway logs for step summary if: always() id: parse-mcp-gateway - uses: 
actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 with: script: | const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); @@ -1105,7 +1105,7 @@ jobs: - name: Parse token usage for step summary if: always() continue-on-error: true - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 with: script: | const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); @@ -1179,7 +1179,7 @@ jobs: steps: - name: Setup Scripts id: setup - uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + uses: github/gh-aw-actions/setup@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 with: destination: ${{ runner.temp }}/gh-aw/actions job-name: ${{ github.job }} @@ -1200,7 +1200,7 @@ jobs: echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT" - name: Process no-op messages id: noop - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} GH_AW_NOOP_MAX: "1" @@ -1219,7 +1219,7 @@ jobs: await main(); - name: Log detection run id: detection_runs - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} GH_AW_WORKFLOW_NAME: "Repo Assist" @@ -1237,7 +1237,7 @@ jobs: await main(); - name: Record missing tool id: missing_tool - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_AGENT_OUTPUT: ${{ 
steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} GH_AW_MISSING_TOOL_CREATE_ISSUE: "true" @@ -1253,7 +1253,7 @@ jobs: await main(); - name: Record incomplete id: report_incomplete - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} GH_AW_REPORT_INCOMPLETE_CREATE_ISSUE: "true" @@ -1270,7 +1270,7 @@ jobs: - name: Handle agent failure id: handle_agent_failure if: always() - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} GH_AW_WORKFLOW_NAME: "Repo Assist" @@ -1307,7 +1307,7 @@ jobs: await main(); - name: Update reaction comment with completion status id: conclusion - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} GH_AW_COMMENT_ID: ${{ needs.activation.outputs.comment_id }} @@ -1342,7 +1342,7 @@ jobs: steps: - name: Setup Scripts id: setup - uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + uses: github/gh-aw-actions/setup@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 with: destination: ${{ runner.temp }}/gh-aw/actions job-name: ${{ github.job }} @@ -1409,7 +1409,7 @@ jobs: ls -la /tmp/gh-aw/threat-detection/ 2>/dev/null || true - name: Setup threat detection if: always() && steps.detection_guard.outputs.run_detection == 'true' - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: WORKFLOW_NAME: "Repo Assist" WORKFLOW_DESCRIPTION: "A friendly repository 
assistant that runs 2 times a day to support contributors and maintainers.\nCan also be triggered on-demand via '/repo-assist ' to perform specific tasks.\n- Labels and triages open issues\n- Comments helpfully on open issues to unblock contributors and onboard newcomers\n- Identifies issues that can be fixed and creates draft pull requests with fixes\n- Improves performance, testing, and code quality via PRs\n- Makes engineering investments: dependency updates, CI improvements, tooling\n- Updates its own PRs when CI fails or merge conflicts arise\n- Nudges stale PRs waiting for author response\n- Takes the repository forward with proactive improvements\n- Maintains a persistent memory of work done and what remains\nAlways polite, constructive, and mindful of the project's goals." @@ -1472,7 +1472,7 @@ jobs: - name: Parse and conclude threat detection id: detection_conclusion if: always() - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: RUN_DETECTION: ${{ steps.detection_guard.outputs.run_detection }} GH_AW_DETECTION_CONTINUE_ON_ERROR: "true" @@ -1493,13 +1493,13 @@ jobs: steps: - name: Setup Scripts id: setup - uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + uses: github/gh-aw-actions/setup@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 with: destination: ${{ runner.temp }}/gh-aw/actions job-name: ${{ github.job }} - name: Check team membership for command workflow id: check_membership - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_REQUIRED_ROLES: "admin,maintainer,write" with: @@ -1511,7 +1511,7 @@ jobs: await main(); - name: Check command position id: check_command_position - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: 
actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_COMMANDS: "[\"repo-assist\"]" with: @@ -1542,7 +1542,7 @@ jobs: steps: - name: Setup Scripts id: setup - uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + uses: github/gh-aw-actions/setup@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 with: destination: ${{ runner.temp }}/gh-aw/actions job-name: ${{ github.job }} @@ -1574,7 +1574,7 @@ jobs: - name: Push repo-memory changes (default) id: push_repo_memory_default if: always() - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_TOKEN: ${{ github.token }} GITHUB_RUN_ID: ${{ github.run_id }} @@ -1637,7 +1637,7 @@ jobs: steps: - name: Setup Scripts id: setup - uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + uses: github/gh-aw-actions/setup@07c7335cd76c4d4d9f00dd7874f85ff55ed71f24 # v0.71.3 with: destination: ${{ runner.temp }}/gh-aw/actions job-name: ${{ github.job }} @@ -1695,7 +1695,7 @@ jobs: echo "GH_HOST=${GH_HOST}" >> "$GITHUB_ENV" - name: Process Safe Outputs id: process_safe_outputs - uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 env: GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} GH_AW_ALLOWED_DOMAINS: 
"*.gradle-enterprise.cloud,*.pythonhosted.org,*.vsblob.vsassets.io,adoptium.net,anaconda.org,api.adoptium.net,api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.foojay.io,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.npms.io,api.nuget.org,api.snapcraft.io,archive.apache.org,archive.ubuntu.com,azure.archive.ubuntu.com,azuresearch-usnc.nuget.org,azuresearch-ussc.nuget.org,binstar.org,bootstrap.pypa.io,builds.dotnet.microsoft.com,bun.sh,cdn.azul.com,cdn.jsdelivr.net,central.sonatype.com,ci.dot.net,conda.anaconda.org,conda.binstar.org,crates.io,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,dc.services.visualstudio.com,deb.nodesource.com,deno.land,develocity.apache.org,dist.nuget.org,dl.google.com,dlcdn.apache.org,dot.net,dotnet.microsoft.com,dotnetcli.blob.core.windows.net,download.eclipse.org,download.java.net,download.oracle.com,downloads.gradle-dn.com,esm.sh,files.pythonhosted.org,ge.spockframework.org,get.pnpm.io,github.com,googleapis.deno.dev,googlechromelabs.github.io,gradle.org,host.docker.internal,index.crates.io,jcenter.bintray.com,jdk.java.net,json-schema.org,json.schemastore.org,jsr.io,keyserver.ubuntu.com,maven-central.storage-download.googleapis.com,maven.apache.org,maven.google.com,maven.oracle.com,maven.pkg.github.com,nodejs.org,npm.pkg.github.com,npmjs.com,npmjs.org,nuget.org,nuget.pkg.github.com,nugetregistryv2prod.blob.core.windows.net,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,oneocsp.microsoft.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,pip.pypa.io,pkgs.dev.azure.com,plugins-artifacts.gradle.org,plugins.gradle.org,ppa.launchpad.net,pypi.org,pypi.python.org,raw.githubusercontent.com,registry.bower.io,registry.npmjs.com,registry.npmjs.org,registry.ya
rnpkg.com,repo.anaconda.com,repo.continuum.io,repo.gradle.org,repo.grails.org,repo.maven.apache.org,repo.spring.io,repo.yarnpkg.com,repo1.maven.org,repository.apache.org,s.symcb.com,s.symcd.com,scans-in.gradle.com,security.ubuntu.com,services.gradle.org,sh.rustup.rs,skimdb.npmjs.com,static.crates.io,static.rust-lang.org,storage.googleapis.com,telemetry.enterprise.githubcopilot.com,telemetry.vercel.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com,www.java.com,www.microsoft.com,www.npmjs.com,www.npmjs.org,yarnpkg.com" diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index fbcc4ebf..01774e61 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -87,7 +87,7 @@ OpenClaw.Tray.Tests ──tests──▶ OpenClaw.Shared |-----------|----------|---------| | **Gateway Communication** | `OpenClaw.Shared/OpenClawGatewayClient.cs` | WebSocket client with protocol v3, reconnect/backoff logic | | **Notification System** | `OpenClaw.Tray.WinUI/App.xaml.cs` | Event routing, toast notifications, classification | -| **WebView2 Integration** | `OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs` | Embedded chat panel with lifecycle management | +| **WebView2 Integration** | `OpenClaw.Tray.WinUI/Windows/ChatWindow.xaml.cs` | Embedded chat panel with lifecycle management | | **Tray Icon Management** | `OpenClaw.Tray.WinUI/Helpers/IconHelper.cs` | GDI handle management, dynamic icon generation | | **Session Tracking** | `OpenClaw.Shared/OpenClawGatewayClient.cs` | Session state, activity tracking, polling | | **Settings & Logging** | `OpenClaw.Tray.WinUI/Services/` | JSON settings persistence, file rotation logging | @@ -285,7 +285,7 @@ Notifications are classified using two strategies: ### WebView2 Lifecycle -The `WebChatWindow` uses Microsoft Edge WebView2 for embedded web content: +The `ChatWindow` uses Microsoft Edge WebView2 for embedded web content: **Initialization:** 1. 
WebView2 control created in XAML @@ -299,7 +299,7 @@ Window Created → WebView2.EnsureCoreWebView2Async() → Navigate to Chat URL ``` **Key Design Decisions:** -- **Singleton pattern**: Only one WebChat window instance exists +- **Singleton pattern**: Only one chat window instance exists - **Hidden instead of disposed**: Window is hidden when closed to preserve state - **Separate user data folder**: Isolates cookies/storage from browser - **Navigation guard**: Prevents accidental navigation away from chat @@ -425,8 +425,8 @@ dotnet test --filter "FullyQualifiedName~AgentActivityTests" ``` **Test Coverage:** -- ✅ **652 tests** in `OpenClaw.Shared.Tests` — models, gateway client, exec approvals, capabilities, URL helpers, notification categorization, shell quoting -- ✅ **262 tests** in `OpenClaw.Tray.Tests` — menu display, menu positioning, settings round-trip, deep link parsing, onboarding state, setup code decoder, security validation, wizard step parsing, localization validation +- ✅ **1182 tests** in `OpenClaw.Shared.Tests` — models, gateway client, exec approvals, capabilities, URL helpers, notification categorization, shell quoting, MCP, device identity, and WinNode client coverage +- ✅ **388 tests** in `OpenClaw.Tray.Tests` — settings round-trip, deep link parsing, onboarding state, setup code decoder, gateway health/chat helpers, security validation, wizard step parsing, gateway discovery, localization validation - ✅ All tests are pure unit tests (no network, no file system, no external dependencies) See [tests/OpenClaw.Shared.Tests/README.md](tests/OpenClaw.Shared.Tests/README.md) for detailed test documentation. @@ -441,7 +441,7 @@ You can test the UI and basic functionality without a running gateway: 3. Enter a dummy gateway URL (e.g., `ws://localhost:18789`) 4. 
The app will show "Disconnected" status but you can: - Test the tray menu structure - - Open Settings dialog and configure preferences + - Open the Settings page and configure preferences - Test auto-start functionality - View logs @@ -487,8 +487,8 @@ You can test the UI and basic functionality without a running gateway: - Verify Windows toast notification appears (if enabled) - Click toast → should open relevant UI -2. **Notification History**: - - Right-click tray → **Notification History** +2. **Activity / notification history**: + - Right-click tray → **Activity Stream** or **Notification History** - Verify past notifications are listed - Test filtering by category diff --git a/README.md b/README.md index e030cec7..2fdf8bb4 100644 --- a/README.md +++ b/README.md @@ -98,13 +98,13 @@ Modern Windows 11-style system tray companion that connects to your local OpenCl - 🌐 **Web Chat** - Embedded chat window with WebView2 - 📊 **Live Status** - Real-time sessions, channels, and usage display - 🧭 **Command Center** - Dense gateway, channel, usage, node, pairing, and allowlist diagnostics from one window -- ⚡ **Activity Stream** - Dedicated flyout for live session, usage, node, and notification events +- ⚡ **Activity Stream** - Command Center page for live session, usage, node, and notification events - 🔔 **Toast Notifications** - Clickable Windows notifications with [smart categorization](docs/NOTIFICATION_CATEGORIZATION.md) - 📡 **Channel Control** - Start/stop Telegram & WhatsApp from the menu - 🖥️ **Node Observability** - Node inventory with online/offline state and copyable summary - ⏱ **Cron Jobs** - Quick access to scheduled tasks - 🚀 **Auto-start** - Launch with Windows -- ⚙️ **Settings** - Full configuration dialog +- ⚙️ **Settings** - Full configuration page - 🎯 **First-run onboarding** — 6-screen setup wizard (connection, permissions, chat, configuration) #### Quick Send scope requirement @@ -123,7 +123,7 @@ If Quick Send fails with `pairing required` / 
`NOT_PAIRED`, that is a **device a ### Menu Sections - **Status** - Gateway connection status with click-to-view details -- **Command Center** - Status detail window with diagnostics, channel health, usage, sessions, nodes, and copyable repair commands +- **Command Center** - Hub with diagnostics, channel health, usage, sessions, nodes, and copyable repair commands - **Sessions** - Active agent sessions with preview and per-session controls - **Usage** - Provider/cost summary with quick jump to activity details - **Channels** - Telegram/WhatsApp status with toggle control @@ -182,6 +182,8 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t | **Device** | `device.info`, `device.status` | Return Windows host/app metadata and lightweight status | | **Text-to-speech** | `tts.speak` | Speak text aloud through Windows speech synthesis, or ElevenLabs when configured | +Packaged installs declare camera, microphone, and location capabilities. Windows may ask for consent the first time a node capability uses one of those protected resources. + #### Node Setup 1. 
**Enable Node Mode** in Settings (enabled by default) @@ -300,12 +302,12 @@ OpenClaw registers the `openclaw://` URL scheme for automation and integration: | Link | Description | |------|-------------| -| `openclaw://settings` | Open Settings dialog | +| `openclaw://settings` | Open the Settings page | | `openclaw://setup` | Open Setup Wizard | -| `openclaw://chat` | Open Web Chat window | +| `openclaw://chat` | Open the Chat page | | `openclaw://commandcenter` | Open Command Center diagnostics | -| `openclaw://activity` | Open Activity Stream | -| `openclaw://history` | Open Notification History | +| `openclaw://activity` | Open the Activity page | +| `openclaw://history` | Open the Activity page filtered to notification history | | `openclaw://dashboard` | Open Dashboard in browser | | `openclaw://dashboard/sessions` | Open specific dashboard page | | `openclaw://dashboard/channels` | Open Channels dashboard page | @@ -342,15 +344,15 @@ PowerToys Command Palette extension for quick OpenClaw access. 
- **📡 Dashboard: Channels** - Open the channel configuration dashboard - **🧩 Dashboard: Skills** - Open the skills dashboard - **⏱️ Dashboard: Cron** - Open the scheduled jobs dashboard -- **💬 Web Chat** - Open the embedded Web Chat window +- **💬 Web Chat** - Open the embedded Chat page - **📝 Quick Send** - Open the Quick Send dialog to compose a message - **🧭 Setup Wizard** - Open pairing/setup - **🧭 Command Center** - Open diagnostics and support actions - **🔄 Run Health Check** - Refresh connection health - **⬇️ Check for Updates** - Run a manual GitHub Releases update check - **⚡ Activity Stream** - Open recent activity -- **📋 Notification History** - Open notification history -- **⚙️ Settings** - Open the OpenClaw Tray Settings dialog +- **📋 Notification History** - Open notification history in the Activity page +- **⚙️ Settings** - Open the OpenClaw Tray Settings page - **📄 Open Log File / 📁 Logs / 🗂️ Config / 🧪 Diagnostics** - Open support files and folders - **📋 Copy Support Context** - Copy redacted Command Center metadata - **🧰 Copy Debug Bundle** - Copy combined support, port, capability, node, channel, and activity diagnostics diff --git a/docs/LOCALIZATION.md b/docs/LOCALIZATION.md index c53148c7..67f35a25 100644 --- a/docs/LOCALIZATION.md +++ b/docs/LOCALIZATION.md @@ -46,7 +46,7 @@ OpenClaw Tray uses WinUI `.resw` resource files for localization. Windows automa 5. **Do not translate resource key names** (the `name` attribute). Only translate `<value>` content. -6. **Submit a pull request** with just your new `Resources.resw` file. No code changes are needed — the build system automatically discovers new locale folders. +6. **Submit a pull request** with just your new `Resources.resw` file. No code changes are needed — the build system and localization tests automatically discover new locale folders.
## How It Works @@ -104,15 +104,17 @@ All onboarding wizard strings use the `Onboarding_` prefix: ## Validation -All 5 resource files must have the **same set of keys**. You can verify with: +All resource files must have the **same set of keys**. Locale directories are discovered dynamically under `Strings/`, so adding a new `Strings/<locale>/Resources.resw` file automatically brings it under validation. You can verify counts with: ```powershell -$locales = @("en-us", "fr-fr", "nl-nl", "zh-cn", "zh-tw") $base = "src\OpenClaw.Tray.WinUI\Strings" -foreach ($loc in $locales) { +Get-ChildItem $base -Directory | ForEach-Object { + $loc = $_.Name $count = (Select-String -Path "$base\$loc\Resources.resw" -Pattern '/Resources.resw` directories; the current locales are English, French, Dutch, Chinese Simplified, and Chinese Traditional. Translations are AI-generated following the repo convention. Technical terms (Gateway, Token, Node Mode) are kept in English across all locales. diff --git a/docs/POWERTOYS.md b/docs/POWERTOYS.md index 522fda05..1d294443 100644 --- a/docs/POWERTOYS.md +++ b/docs/POWERTOYS.md @@ -39,15 +39,15 @@ Open Command Palette (`Win+Alt+Space`), type **"OpenClaw"** — you should see t | **📡 Dashboard: Channels** | Opens the channel configuration dashboard | | **🧩 Dashboard: Skills** | Opens the skills dashboard | | **⏱️ Dashboard: Cron** | Opens the scheduled jobs dashboard | -| **💬 Web Chat** | Opens the embedded Web Chat window in OpenClaw Tray | +| **💬 Web Chat** | Opens the embedded Chat page in OpenClaw Tray | | **📝 Quick Send** | Opens the Quick Send dialog to compose a message | | **🧭 Setup Wizard** | Opens QR, setup code, and manual gateway pairing | | **🧭 Command Center** | Opens gateway, tunnel, node, browser, and support diagnostics | | **🔄 Run Health Check** | Refreshes gateway or node connection health | | **⬇️ Check for Updates** | Runs a manual GitHub Releases update check | | **⚡ Activity Stream** | Opens recent tray activity and support bundle
actions | -| **📋 Notification History** | Opens recent OpenClaw tray notifications | -| **⚙️ Settings** | Opens the OpenClaw Tray Settings dialog | +| **📋 Notification History** | Opens recent OpenClaw tray notifications in the Activity page | +| **⚙️ Settings** | Opens the OpenClaw Tray Settings page | | **📄 Open Log File** | Opens the current OpenClaw Tray log | | **📁 Open Logs Folder** | Opens the OpenClaw Tray logs folder | | **🗂️ Open Config Folder** | Opens the OpenClaw Tray configuration folder | diff --git a/docs/SETUP.md b/docs/SETUP.md index b0db5776..2ae87713 100644 --- a/docs/SETUP.md +++ b/docs/SETUP.md @@ -63,7 +63,7 @@ On first launch, Molty opens a **6-screen onboarding wizard** that walks you thr - **Camera** — for camera capture - **Microphone** — for voice input - **Screen Capture** — for screenshots - - **Location** — optional, for location-aware features + - **Location** — optional, for location-aware features; packaged installs declare this capability so Windows may prompt for location consent the first time it is used Each permission shows its current status. Click **Open Settings** next to any permission to jump directly to the relevant Windows Settings page. 
@@ -95,14 +95,14 @@ OpenClaw Tray responds to `openclaw://` deep links, which can be invoked from a | `openclaw://dashboard/channels` | Open the channels dashboard page | | `openclaw://dashboard/skills` | Open the skills dashboard page | | `openclaw://dashboard/cron` | Open the cron dashboard page | -| `openclaw://chat` | Open the embedded Web Chat window | +| `openclaw://chat` | Open the embedded Chat page | | `openclaw://send` | Open the Quick Send dialog | | `openclaw://send?message=Hello` | Open Quick Send with pre-filled text | -| `openclaw://settings` | Open the Settings dialog | +| `openclaw://settings` | Open the Settings page | | `openclaw://setup` | Open the Setup Wizard | | `openclaw://commandcenter` | Open Command Center diagnostics | -| `openclaw://activity` | Open the Activity Stream | -| `openclaw://history` | Open Notification History | +| `openclaw://activity` | Open the Activity page | +| `openclaw://history` | Open the Activity page filtered to notification history | | `openclaw://healthcheck` | Run a manual health check | | `openclaw://check-updates` | Run a manual update check | | `openclaw://logs` | Open the current tray log file | diff --git a/docs/TEST_COVERAGE.md b/docs/TEST_COVERAGE.md index 243fd642..ea653465 100644 --- a/docs/TEST_COVERAGE.md +++ b/docs/TEST_COVERAGE.md @@ -1,17 +1,17 @@ # Test Coverage Summary -**914 tests total** (652 shared + 262 tray) — all passing ✅ +**1570 tests total** (1182 shared + 388 tray) — all passing ✅ | Metric | Value | |--------|-------| -| Total Tests | 914 | -| Passing | 914 (100%) | +| Total Tests | 1570 | +| Passing | 1570 (100%) | | Failing | 0 | | Framework | xUnit 2.9.3 / .NET 10.0 | ## Test Projects -### OpenClaw.Shared.Tests — 652 tests +### OpenClaw.Shared.Tests — 1182 tests #### ModelsTests - **AgentActivityTests** (~15) — glyph mapping for all ActivityKind values, display text formatting @@ -71,7 +71,7 @@ --- -### OpenClaw.Tray.Tests — 262 tests +### OpenClaw.Tray.Tests — 388 tests #### Core 
Tray Tests @@ -83,14 +83,14 @@ #### Onboarding Tests - **OnboardingStateTests** (19) — Page order, mode logic, route changes, wizard state persistence, completion, disposal -- **WizardStepPropsTests** (4) — Enum values, record defaults, callback verification - **GatewayChatHelperTests** (11) — URL scheme conversion, token encoding, localhost checks, session keys - **LocalGatewayApproverTests** (13) — IsLocalGateway for localhost/remote/edge cases - **SetupCodeDecoderTests** (14) — Base64url decode, size limits, JSON validation, URL/token extraction - **GatewayHealthCheckTests** (6) — Health URI building, scheme conversion, port preservation - **SecurityValidationTests** (16) — Locale whitelist, port range, path traversal, URI scheme validation - **WizardStepParsingTests** (12) — JSON step parsing, options, completion, sensitive fields -- **LocalizationValidationTests** (6) — 5-locale key parity, onboarding key presence, no duplicates +- **GatewayDiscoveryServiceTests** — mDNS host selection and connection URL regression coverage +- **LocalizationValidationTests** — locale key parity, onboarding key presence, duplicate detection, and all-or-none translation consistency --- @@ -124,6 +124,6 @@ dotnet test --logger "console;verbosity=detailed" --- -**Last Updated**: 2026-04-26 +**Last Updated**: 2026-05-04 **Framework**: xUnit 2.9.3 / .NET 10.0 -**Status**: ✅ 914 tests passing +**Status**: ✅ 1570 tests passing diff --git a/docs/a2ui/grading.md b/docs/a2ui/grading.md index 0754c385..05e7c17b 100644 --- a/docs/a2ui/grading.md +++ b/docs/a2ui/grading.md @@ -262,7 +262,7 @@ fake `WindowsNodeClient`). 
| Per-surface theme scope | `Hosting/SurfaceHost.cs ApplyThemeToScope` | multi-surface tab views don't bleed themes | | `IA2UITelemetry` seam | `Telemetry/IA2UITelemetry.cs` | structured events instead of log scraping | | Single-handler `Func` events on `CanvasCapability` | reviewed in commit `5b9c468` | catches accidental multi-subscribe instead of silent `Delegate.Combine` | -| MCP bearer token in Settings UI | `SettingsWindow.xaml.cs` | quality-of-life for MCP setup, kept out of action payloads | +| MCP bearer token in Settings UI | `SettingsPage.xaml.cs` | quality-of-life for MCP setup, kept out of action payloads | --- diff --git a/src/OpenClaw.Shared/Capabilities/AppCapability.cs b/src/OpenClaw.Shared/Capabilities/AppCapability.cs new file mode 100644 index 00000000..30092bac --- /dev/null +++ b/src/OpenClaw.Shared/Capabilities/AppCapability.cs @@ -0,0 +1,154 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Capabilities; + +/// +/// App-level capability exposing navigation, status, and configuration +/// through the MCP server for programmatic testing and CLI agents. +/// +public class AppCapability : NodeCapabilityBase +{ + public override string Category => "app"; + + private static readonly string[] _commands = new[] + { + "app.navigate", + "app.status", + "app.sessions", + "app.agents", + "app.nodes", + "app.config.get", + "app.settings.get", + "app.settings.set", + "app.menu", + "app.search", + }; + + public override IReadOnlyList Commands => _commands; + + // Handler delegates — wired up by App.xaml.cs after construction. + public Func>? NavigateHandler; + public Func? StatusHandler; + public Func>? SessionsHandler; + public Func>? AgentsHandler; + public Func? NodesHandler; + public Func>? ConfigGetHandler; + public Func? SettingsGetHandler; + public Func? SettingsSetHandler; + public Func? MenuHandler; + public Func? 
SearchHandler; + + public AppCapability(IOpenClawLogger logger) : base(logger) { } + + public override async Task ExecuteAsync(NodeInvokeRequest request) + { + return request.Command switch + { + "app.navigate" => await HandleNavigate(request), + "app.status" => HandleStatus(), + "app.sessions" => await HandleSessions(request), + "app.agents" => await HandleAgents(), + "app.nodes" => HandleNodes(), + "app.config.get" => await HandleConfigGet(request), + "app.settings.get" => HandleSettingsGet(request), + "app.settings.set" => HandleSettingsSet(request), + "app.menu" => HandleMenu(), + "app.search" => HandleSearch(request), + _ => Error($"Unknown command: {request.Command}") + }; + } + + private async Task HandleNavigate(NodeInvokeRequest request) + { + var page = GetStringArg(request.Args, "page"); + if (string.IsNullOrEmpty(page)) + return Error("Missing required arg: page"); + if (NavigateHandler == null) + return Error("Navigate handler not registered"); + var result = await NavigateHandler(page); + return Success(result); + } + + private NodeInvokeResponse HandleStatus() + { + if (StatusHandler == null) + return Error("Status handler not registered"); + return Success(StatusHandler()); + } + + private async Task HandleSessions(NodeInvokeRequest request) + { + var agentId = GetStringArg(request.Args, "agentId"); + if (SessionsHandler == null) + return Error("Sessions handler not registered"); + var result = await SessionsHandler(agentId); + return Success(result); + } + + private async Task HandleAgents() + { + if (AgentsHandler == null) + return Error("Agents handler not registered"); + var result = await AgentsHandler(); + return Success(result); + } + + private NodeInvokeResponse HandleNodes() + { + if (NodesHandler == null) + return Error("Nodes handler not registered"); + return Success(NodesHandler()); + } + + private async Task HandleConfigGet(NodeInvokeRequest request) + { + var path = GetStringArg(request.Args, "path"); + if (ConfigGetHandler == null) + 
return Error("Config handler not registered"); + var result = await ConfigGetHandler(path); + return Success(result); + } + + private NodeInvokeResponse HandleSettingsGet(NodeInvokeRequest request) + { + var name = GetStringArg(request.Args, "name"); + if (string.IsNullOrEmpty(name)) + return Error("Missing required arg: name"); + if (SettingsGetHandler == null) + return Error("Settings handler not registered"); + return Success(SettingsGetHandler(name)); + } + + private NodeInvokeResponse HandleSettingsSet(NodeInvokeRequest request) + { + var name = GetStringArg(request.Args, "name"); + var value = GetStringArg(request.Args, "value"); + if (string.IsNullOrEmpty(name)) + return Error("Missing required arg: name"); + if (value == null) + return Error("Missing required arg: value"); + if (SettingsSetHandler == null) + return Error("Settings handler not registered"); + return Success(SettingsSetHandler(name, value)); + } + + private NodeInvokeResponse HandleMenu() + { + if (MenuHandler == null) + return Error("Menu handler not registered"); + return Success(MenuHandler()); + } + + private NodeInvokeResponse HandleSearch(NodeInvokeRequest request) + { + var query = GetStringArg(request.Args, "query"); + if (string.IsNullOrEmpty(query)) + return Error("Missing required arg: query"); + if (SearchHandler == null) + return Error("Search handler not registered"); + return Success(SearchHandler(query)); + } +} diff --git a/src/OpenClaw.Shared/DeviceIdentity.cs b/src/OpenClaw.Shared/DeviceIdentity.cs index e3c496e5..3fa66eff 100644 --- a/src/OpenClaw.Shared/DeviceIdentity.cs +++ b/src/OpenClaw.Shared/DeviceIdentity.cs @@ -3,6 +3,7 @@ using System.Security.Cryptography; using System.Text; using System.Text.Json; +using OpenClaw.Shared.Mcp; using NSec.Cryptography; namespace OpenClaw.Shared; @@ -146,8 +147,11 @@ private void GenerateNew() { Directory.CreateDirectory(dir); } + if (!string.IsNullOrEmpty(dir)) + McpAuthToken.TryRestrictDataDirectoryAcl(dir); 
File.WriteAllText(_keyPath, JsonSerializer.Serialize(data, new JsonSerializerOptions { WriteIndented = true })); + McpAuthToken.TryRestrictSensitiveFileAcl(_keyPath); _logger.Info($"Generated new Ed25519 device identity: {_deviceId}"); } @@ -307,6 +311,9 @@ public string BuildDebugPayload(string nonce, long signedAtMs, string clientId, /// public void StoreDeviceToken(string token) { + if (string.IsNullOrWhiteSpace(token)) + throw new ArgumentException("Device token cannot be empty.", nameof(token)); + _deviceToken = token; // Update the key file with the token @@ -320,6 +327,7 @@ public void StoreDeviceToken(string token) { data.DeviceToken = token; File.WriteAllText(_keyPath, JsonSerializer.Serialize(data, new JsonSerializerOptions { WriteIndented = true })); + McpAuthToken.TryRestrictSensitiveFileAcl(_keyPath); _logger.Info("Device token stored"); } } diff --git a/src/OpenClaw.Shared/Mcp/McpAuthToken.cs b/src/OpenClaw.Shared/Mcp/McpAuthToken.cs index 2335c638..50cda88f 100644 --- a/src/OpenClaw.Shared/Mcp/McpAuthToken.cs +++ b/src/OpenClaw.Shared/Mcp/McpAuthToken.cs @@ -96,7 +96,7 @@ public static string LoadOrCreate(string path) try { File.WriteAllText(tempPath, token, Encoding.UTF8); - TryRestrictFileAcl(tempPath); + TryRestrictSensitiveFileAcl(tempPath); File.Move(tempPath, path, overwrite: true); } catch @@ -104,7 +104,7 @@ public static string LoadOrCreate(string path) try { if (File.Exists(tempPath)) File.Delete(tempPath); } catch { } throw; } - TryRestrictFileAcl(path); + TryRestrictSensitiveFileAcl(path); return token; } @@ -127,7 +127,7 @@ public static string Reset(string path) try { File.WriteAllText(tempPath, token, Encoding.UTF8); - TryRestrictFileAcl(tempPath); + TryRestrictSensitiveFileAcl(tempPath); File.Move(tempPath, path, overwrite: true); } catch @@ -137,7 +137,7 @@ public static string Reset(string path) } // Move on Windows preserves the source's DACL; re-apply defensively in // case a future rename strategy substitutes a different file. 
- TryRestrictFileAcl(path); + TryRestrictSensitiveFileAcl(path); return token; } @@ -183,8 +183,9 @@ public static void TryRestrictDataDirectoryAcl(string dir) catch { /* best-effort; acl restriction is defense-in-depth, not load-bearing */ } } - private static void TryRestrictFileAcl(string path) + public static void TryRestrictSensitiveFileAcl(string path) { + if (string.IsNullOrEmpty(path)) return; if (!OperatingSystem.IsWindows()) return; try { RestrictFileAclWindows(path); } catch { /* see above */ } diff --git a/src/OpenClaw.Shared/Mcp/McpHttpServer.cs b/src/OpenClaw.Shared/Mcp/McpHttpServer.cs index c5bd53d1..a3a45eb3 100644 --- a/src/OpenClaw.Shared/Mcp/McpHttpServer.cs +++ b/src/OpenClaw.Shared/Mcp/McpHttpServer.cs @@ -89,11 +89,10 @@ public McpHttpServer(McpToolBridge bridge, int port, IOpenClawLogger logger, str _port = port; _authToken = string.IsNullOrEmpty(authToken) ? null : authToken; _listener = new HttpListener(); - // Loopback binding — not reachable from other machines. - // Register both numeric and hostname forms so clients that connect - // via http://localhost:port/ (common on Linux/macOS) are also served. + // Loopback binding — not reachable from other machines. Use only the + // numeric host on Windows so non-elevated startup does not require a + // separate netsh http urlacl reservation for http://localhost:port/. _listener.Prefixes.Add($"http://127.0.0.1:{port}/"); - _listener.Prefixes.Add($"http://localhost:{port}/"); } public void Start() diff --git a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs index c386d4a7..b33058a1 100644 --- a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs +++ b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs @@ -243,6 +243,28 @@ private object HandleToolsList() // tts.* ["tts.speak"] = "Speak text aloud on the Windows node. Args: text (string, required), provider ('windows'|'elevenlabs', optional), voiceId (string, optional), model (string, optional), interrupt (bool, default false). 
Returns { spoken, provider, contentType, durationMs }.", + + // app.* + ["app.navigate"] = + "Navigate the companion app to a specific page (e.g., 'home', 'sessions', 'settings'). Args: page (string, required). Returns { navigated, page }.", + ["app.status"] = + "Get current connection status, node state, and gateway info. Returns { connectionStatus, nodeConnected, nodePaired, nodePendingApproval, gatewayVersion, sessionCount, nodeCount }.", + ["app.sessions"] = + "List active sessions with optional agent filter. Args: agentId (string, optional). Returns array of { Key, Status, Model, AgeText, tokens }.", + ["app.agents"] = + "List agents from the connected gateway. Returns the raw agents JSON array.", + ["app.nodes"] = + "List connected nodes and their capabilities. Returns array of { DisplayName, NodeId, IsOnline, Platform, CapabilityCount }.", + ["app.config.get"] = + "Read gateway configuration value at a dot-path. Args: path (string, optional). Returns the config subtree or full config.", + ["app.settings.get"] = + "Read a local app setting by name. Args: name (string, required). Returns the setting value.", + ["app.settings.set"] = + "Set a local app setting (name and value). Args: name (string, required), value (string, required). Returns { name, value }.", + ["app.menu"] = + "Get tray menu state (status, session count, node count). Returns array of menu items.", + ["app.search"] = + "Search the command palette and return matching commands. Args: query (string, required). Returns array of { Title, Subtitle, Icon }.", }; private async Task HandleToolsCallAsync(JsonElement parameters, CancellationToken cancellationToken) diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs index da7ef89e..04d3c5cb 100644 --- a/src/OpenClaw.Shared/Models.cs +++ b/src/OpenClaw.Shared/Models.cs @@ -1511,3 +1511,266 @@ internal static string FormatLargeNumber(long n) } } +// ── Agent Events ── + +/// Raw agent event from gateway broadcast. 
+public class AgentEventInfo +{ + public string RunId { get; set; } = ""; + public int Seq { get; set; } + public string Stream { get; set; } = ""; + public double Ts { get; set; } + public JsonElement Data { get; set; } + public string? SessionKey { get; set; } + public string? Summary { get; set; } + + public DateTime Timestamp => DateTimeOffset.FromUnixTimeMilliseconds((long)Ts).LocalDateTime; + + public string FormattedTime => Timestamp.ToString("HH:mm:ss.fff"); + + public string StreamUpper => Stream.ToUpperInvariant(); + + /// Color hex for stream badge (used by UI to create brush). + public string BadgeColorHex => Stream.ToLowerInvariant() switch + { + "tool" => "#FFDC781E", // Orange + "assistant" => "#FF28A050", // Green + "error" => "#FFC83232", // Red + "lifecycle" => "#FF3C78C8", // Blue + "plan" => "#FF8C50C8", // Purple + "approval" => "#FFC8A01E", // Amber + "thinking" => "#FF648CB4", // Steel + "patch" => "#FF50A0A0", // Teal + _ => "#FF646464" // Gray + }; + + /// Human-readable summary extracted from event data. + public string SummaryLine + { + get + { + if (!string.IsNullOrEmpty(Summary)) return Summary; + try + { + var s = Stream.ToLowerInvariant(); + if (s == "tool" && Data.ValueKind == JsonValueKind.Object) + { + var name = Data.TryGetProperty("name", out var n) ? n.GetString() : null; + var phase = Data.TryGetProperty("phase", out var p) ? p.GetString() : null; + if (name != null) return phase != null ? $"🔧 {name} ({phase})" : $"🔧 {name}"; + } + if (s == "assistant" && Data.ValueKind == JsonValueKind.Object) + { + var text = Data.TryGetProperty("text", out var t) ? t.GetString() : null; + if (text != null) return text.Length > 120 ? text[..120] + "…" : text; + } + if (s == "error" && Data.ValueKind == JsonValueKind.Object) + { + var msg = Data.TryGetProperty("message", out var m) ? m.GetString() + : Data.TryGetProperty("error", out var e) ? 
e.GetString() : null; + if (msg != null) return $"❌ {msg}"; + } + if (s == "lifecycle" && Data.ValueKind == JsonValueKind.Object) + { + var state = Data.TryGetProperty("state", out var st) ? st.GetString() : null; + if (state != null) return $"⚡ {state}"; + } + } + catch { } + return ""; + } + } + + public bool HasSummary => !string.IsNullOrEmpty(SummaryLine); + + public string DataJson + { + get + { + try + { + return JsonSerializer.Serialize(Data, new JsonSerializerOptions { WriteIndented = true }); + } + catch + { + return Data.ToString() ?? "{}"; + } + } + } +} + +// ── Node/Device Pairing ── + +public class PairingRequest +{ + public string RequestId { get; set; } = ""; + public string NodeId { get; set; } = ""; + public string? DisplayName { get; set; } + public string? Platform { get; set; } + public string? Version { get; set; } + public string? RemoteIp { get; set; } + public bool IsRepair { get; set; } + public double Ts { get; set; } + + public DateTime Timestamp => DateTimeOffset.FromUnixTimeMilliseconds((long)Ts).LocalDateTime; + + public string Description + { + get + { + var lines = new List(); + lines.Add($"Node: {DisplayName ?? NodeId}"); + if (!string.IsNullOrEmpty(Platform)) lines.Add($"Platform: {Platform}"); + if (!string.IsNullOrEmpty(Version)) lines.Add($"Version: {Version}"); + if (!string.IsNullOrEmpty(RemoteIp)) lines.Add($"IP: {RemoteIp}"); + if (IsRepair) lines.Add("Repair: yes"); + return string.Join("\n", lines); + } + } +} + +public class DevicePairingRequest +{ + public string RequestId { get; set; } = ""; + public string DeviceId { get; set; } = ""; + public string? PublicKey { get; set; } + public string? DisplayName { get; set; } + public string? Platform { get; set; } + public string? ClientId { get; set; } + public string? ClientMode { get; set; } + public string? Role { get; set; } + public string[]? Scopes { get; set; } + public string? 
RemoteIp { get; set; } + public bool IsRepair { get; set; } + public double Ts { get; set; } + + public DateTime Timestamp => DateTimeOffset.FromUnixTimeMilliseconds((long)Ts).LocalDateTime; + + public string Description + { + get + { + var lines = new List(); + lines.Add($"Device: {DisplayName ?? DeviceId}"); + if (!string.IsNullOrEmpty(Platform)) lines.Add($"Platform: {Platform}"); + if (!string.IsNullOrEmpty(Role)) lines.Add($"Role: {Role}"); + if (Scopes is { Length: > 0 }) lines.Add($"Scopes: {string.Join(", ", Scopes)}"); + if (!string.IsNullOrEmpty(RemoteIp)) lines.Add($"IP: {RemoteIp}"); + if (IsRepair) lines.Add("Repair: yes"); + return string.Join("\n", lines); + } + } +} + +public class PairingListInfo +{ + public List Pending { get; set; } = new(); +} + +public class DevicePairingListInfo +{ + public List Pending { get; set; } = new(); +} + +// ── Models List ── + +public class ModelInfo +{ + public string Id { get; set; } = ""; + public string? Name { get; set; } + public string? Provider { get; set; } + public int? ContextWindow { get; set; } + public bool IsConfigured { get; set; } + + public string DisplayName => Name ?? Id; +} + +public class ModelsListInfo +{ + public List Models { get; set; } = new(); +} + +// ── Agent Info ── + +public class AgentInfo +{ + public string Id { get; set; } = ""; + public string? Name { get; set; } + public string? Emoji { get; set; } + public string? Workspace { get; set; } + public string? ModelPrimary { get; set; } + public string DisplayName => Name ?? Id; +} + +// ── Presence (connected clients/instances) ── + +public class PresenceEntry +{ + public string? Host { get; set; } + public string? Ip { get; set; } + public string? Version { get; set; } + public string? Platform { get; set; } + public string? DeviceFamily { get; set; } + public string? ModelIdentifier { get; set; } + public string? Mode { get; set; } + public int? LastInputSeconds { get; set; } + public string? Reason { get; set; } + public string[]? 
Tags { get; set; } + public string? Text { get; set; } + public long Ts { get; set; } + public string? DeviceId { get; set; } + public string[]? Roles { get; set; } + public string[]? Scopes { get; set; } + public string? InstanceId { get; set; } + + public string DisplayName => Host ?? DeviceId ?? Ip ?? "Unknown"; + public DateTime Timestamp => DateTimeOffset.FromUnixTimeSeconds(Ts).LocalDateTime; + public string PlatformLabel => Platform ?? "unknown"; + public string ModeLabel => Mode ?? "unknown"; + + public string LastSeenText + { + get + { + if (LastInputSeconds is not { } secs) return ""; + if (secs < 60) return $"{secs}s ago"; + if (secs < 3600) return $"{secs / 60}m ago"; + return $"{secs / 3600}h ago"; + } + } +} + +// ── Gateway Discovery ── + +public class DiscoveredGateway +{ + public string Id { get; set; } = ""; + public string DisplayName { get; set; } = ""; + public string? Host { get; set; } + public int Port { get; set; } + public string? LanHost { get; set; } + public string? TailnetDns { get; set; } + public bool TlsEnabled { get; set; } + public string? TlsFingerprint { get; set; } + + public string ConnectionUrl + { + get + { + var scheme = TlsEnabled ? "wss" : "ws"; + var host = Host ?? LanHost ?? "localhost"; + return $"{scheme}://{host}:{Port}"; + } + } + + public string HttpUrl + { + get + { + var scheme = TlsEnabled ? "https" : "http"; + var host = Host ?? LanHost ?? 
"localhost"; + return $"{scheme}://{host}:{Port}"; + } + } +} + diff --git a/src/OpenClaw.Shared/OpenClawGatewayClient.cs b/src/OpenClaw.Shared/OpenClawGatewayClient.cs index ae4f807f..0a3e402c 100644 --- a/src/OpenClaw.Shared/OpenClawGatewayClient.cs +++ b/src/OpenClaw.Shared/OpenClawGatewayClient.cs @@ -2,6 +2,7 @@ using System.Collections.Frozen; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text; using System.Text.Json; using System.Threading; @@ -63,6 +64,12 @@ private enum SignatureTokenMode private bool _usageCostUnsupported; private bool _sessionPreviewUnsupported; private bool _nodeListUnsupported; + private bool _modelsListUnsupported; + private bool _nodePairListUnsupported; + private bool _devicePairListUnsupported; + private bool _agentsListUnsupported; + private bool _agentFilesListUnsupported; + private bool _agentFileGetUnsupported; private bool _operatorReadScopeUnavailable; private bool _pairingRequiredAwaitingApproval; private bool _authFailed; @@ -90,6 +97,12 @@ private void ResetUnsupportedMethodFlags() _usageCostUnsupported = false; _sessionPreviewUnsupported = false; _nodeListUnsupported = false; + _modelsListUnsupported = false; + _nodePairListUnsupported = false; + _devicePairListUnsupported = false; + _agentsListUnsupported = false; + _agentFilesListUnsupported = false; + _agentFileGetUnsupported = false; _operatorReadScopeUnavailable = false; } @@ -145,6 +158,21 @@ protected override void OnDisposing() public event EventHandler? SessionPreviewUpdated; public event EventHandler? SessionCommandCompleted; public event EventHandler? GatewaySelfUpdated; + public event EventHandler? CronListUpdated; + public event EventHandler? CronStatusUpdated; + public event EventHandler? SkillsStatusUpdated; + public event EventHandler? ConfigUpdated; + public event EventHandler? ConfigSchemaUpdated; + + // New events for agent events, pairing, and models + public event EventHandler? 
AgentEventReceived; + public event EventHandler? NodePairListUpdated; + public event EventHandler? DevicePairListUpdated; + public event EventHandler? ModelsListUpdated; + public event EventHandler? PresenceUpdated; + public event EventHandler? AgentsListUpdated; + public event EventHandler? AgentFilesListUpdated; + public event EventHandler? AgentFileContentUpdated; public string? OperatorDeviceId => _operatorDeviceId; public IReadOnlyList GrantedOperatorScopes => _grantedOperatorScopes; @@ -289,10 +317,13 @@ public async Task SendWizardRequestAsync(string method, object? par } /// Request session list from gateway. - public async Task RequestSessionsAsync() + public async Task RequestSessionsAsync(string? agentId = null) { if (_operatorReadScopeUnavailable) return; - await SendTrackedRequestAsync("sessions.list"); + if (!string.IsNullOrEmpty(agentId)) + await SendTrackedRequestAsync("sessions.list", new { agentId }); + else + await SendTrackedRequestAsync("sessions.list"); } /// Request usage/context info from gateway (may not be supported on all gateways). @@ -388,6 +419,139 @@ public Task CompactSessionAsync(string key, int maxLines = 400) return TrySendTrackedRequestAsync("sessions.compact", new { key, maxLines }); } + // Cron job management + + public async Task RequestCronListAsync() + { + await SendTrackedRequestAsync("cron.list"); + } + + public async Task RequestCronStatusAsync() + { + await SendTrackedRequestAsync("cron.status"); + } + + public Task RunCronJobAsync(string jobId, bool force = true) + { + return TrySendTrackedRequestAsync("cron.run", new { jobId, force }); + } + + public Task RemoveCronJobAsync(string jobId) + { + return TrySendTrackedRequestAsync("cron.remove", new { id = jobId }); + } + + // Skills/plugin management + + public async Task RequestSkillsStatusAsync(string? 
agentId = null) + { + if (!string.IsNullOrEmpty(agentId)) + await SendTrackedRequestAsync("skills.status", new { agentId }); + else + await SendTrackedRequestAsync("skills.status"); + } + + public Task InstallSkillAsync(string skillId) + { + return TrySendTrackedRequestAsync("skills.install", new { id = skillId }); + } + + public Task UpdateSkillAsync(string skillId) + { + return TrySendTrackedRequestAsync("skills.update", new { id = skillId }); + } + + // Gateway config management + + public async Task RequestConfigAsync() + { + await SendTrackedRequestAsync("config.get"); + } + + public async Task RequestConfigSchemaAsync() + { + await SendTrackedRequestAsync("config.schema"); + } + + public Task SetConfigAsync(string path, object value) + { + return TrySendTrackedRequestAsync("config.set", new { path, value }); + } + + /// + /// Patch the gateway config. The gateway expects { raw: "full json string", baseHash: "..." }. + /// + public Task PatchConfigAsync(JsonElement fullConfig, string? 
baseHash) + { + var raw = fullConfig.GetRawText(); + if (baseHash != null) + return TrySendTrackedRequestAsync("config.patch", new { raw, baseHash }); + else + return TrySendTrackedRequestAsync("config.patch", new { raw }); + } + + // Agent methods + + public async Task RequestAgentsListAsync() + { + if (_agentsListUnsupported) return; + await SendTrackedRequestAsync("agents.list"); + } + + public async Task RequestAgentFilesListAsync(string agentId = "main") + { + if (_agentFilesListUnsupported) return; + await SendTrackedRequestAsync("agents.files.list", new { agentId }); + } + + public async Task RequestAgentFileGetAsync(string agentId, string name) + { + if (_agentFileGetUnsupported) return; + await SendTrackedRequestAsync("agents.files.get", new { agentId, name }); + } + + // Models list + + public async Task RequestModelsListAsync() + { + if (_modelsListUnsupported) return; + await SendTrackedRequestAsync("models.list", new { view = "configured" }); + } + + // Node/Device pairing + + public async Task RequestNodePairListAsync() + { + if (_nodePairListUnsupported) return; + await SendTrackedRequestAsync("node.pair.list"); + } + + public Task NodePairApproveAsync(string requestId) + { + return TrySendTrackedRequestAsync("node.pair.approve", new { requestId }); + } + + public Task NodePairRejectAsync(string requestId) + { + return TrySendTrackedRequestAsync("node.pair.reject", new { requestId }); + } + + public async Task RequestDevicePairListAsync() + { + if (_devicePairListUnsupported) return; + await SendTrackedRequestAsync("device.pair.list"); + } + + public Task DevicePairApproveAsync(string requestId) + { + return TrySendTrackedRequestAsync("device.pair.approve", new { requestId }); + } + + public Task DevicePairRejectAsync(string requestId) + { + return TrySendTrackedRequestAsync("device.pair.reject", new { requestId }); + } + /// Start a channel (telegram, whatsapp, etc). 
public async Task StartChannelAsync(string channelName) { @@ -803,6 +967,10 @@ private void HandleResponse(JsonElement root) _logger.Info($"Granted operator scopes: {string.Join(", ", _grantedOperatorScopes)}"); } _logger.Info($"Main session key: {_mainSessionKey}"); + + // Extract presence from snapshot + TryParsePresence(payload); + RaiseStatusChanged(ConnectionStatus.Connected); // Request initial state after handshake @@ -813,6 +981,7 @@ private void HandleResponse(JsonElement root) await RequestSessionsAsync(); await RequestUsageAsync(); await RequestNodesAsync(); + await RequestAgentsListAsync(); }); } @@ -876,6 +1045,56 @@ private bool HandleKnownResponse(string method, JsonElement payload) case "sessions.compact": ParseSessionCommandResult(method, payload); return true; + case "cron.list": + CronListUpdated?.Invoke(this, payload.Clone()); + return true; + case "cron.status": + CronStatusUpdated?.Invoke(this, payload.Clone()); + return true; + case "cron.run": + case "cron.remove": + return true; + case "skills.status": + SkillsStatusUpdated?.Invoke(this, payload.Clone()); + return true; + case "skills.install": + case "skills.update": + return true; + case "config.get": + ConfigUpdated?.Invoke(this, payload.Clone()); + return true; + case "config.schema": + ConfigSchemaUpdated?.Invoke(this, payload.Clone()); + return true; + case "config.set": + case "config.patch": + return true; + case "agents.list": + AgentsListUpdated?.Invoke(this, payload.Clone()); + return true; + case "agents.files.list": + AgentFilesListUpdated?.Invoke(this, payload.Clone()); + return true; + case "agents.files.get": + AgentFileContentUpdated?.Invoke(this, payload.Clone()); + return true; + case "models.list": + ParseModelsList(payload); + return true; + case "node.pair.list": + ParseNodePairList(payload); + return true; + case "node.pair.approve": + case "node.pair.reject": + _ = RequestNodePairListAsync(); + return true; + case "device.pair.list": + ParseDevicePairList(payload); + 
return true; + case "device.pair.approve": + case "device.pair.reject": + _ = RequestDevicePairListAsync(); + return true; default: return false; } @@ -968,6 +1187,30 @@ private void HandleRequestError(string? method, JsonElement root) _nodeListUnsupported = true; _logger.Warn("node.list unsupported on gateway"); return; + case "models.list": + _modelsListUnsupported = true; + _logger.Warn("models.list unsupported on gateway"); + return; + case "node.pair.list": + _nodePairListUnsupported = true; + _logger.Warn("node.pair.list unsupported on gateway"); + return; + case "device.pair.list": + _devicePairListUnsupported = true; + _logger.Warn("device.pair.list unsupported on gateway"); + return; + case "agents.list": + _agentsListUnsupported = true; + _logger.Warn("agents.list unsupported on gateway"); + return; + case "agents.files.list": + _agentFilesListUnsupported = true; + _logger.Warn("agents.files.list unsupported on gateway"); + return; + case "agents.files.get": + _agentFileGetUnsupported = true; + _logger.Warn("agents.files.get unsupported on gateway"); + return; } } @@ -1246,6 +1489,21 @@ private void HandleEvent(JsonElement root) case "session": HandleSessionEvent(root); break; + case "node.pair.requested": + case "node.pair.resolved": + // Refresh node pair list when pairing state changes + _ = RequestNodePairListAsync(); + break; + case "device.pair.requested": + case "device.pair.resolved": + // Refresh device pair list when pairing state changes + _ = RequestDevicePairListAsync(); + break; + case "presence": + // Presence snapshot broadcast when clients connect/disconnect + if (root.TryGetProperty("payload", out var presPayload)) + TryParsePresenceFromBroadcast(presPayload); + break; } } @@ -1275,13 +1533,33 @@ private void HandleAgentEvent(JsonElement root) { if (!root.TryGetProperty("payload", out var payload)) return; - // Determine session + // sessionKey is inside payload, not root var sessionKey = "unknown"; - if 
(root.TryGetProperty("sessionKey", out var sk)) + if (payload.TryGetProperty("sessionKey", out var sk)) sessionKey = sk.GetString() ?? "unknown"; var isMain = sessionKey == "main" || sessionKey.Contains(":main:"); - // Parse activity from stream field + // Emit raw agent event (cloned for thread safety) + try + { + var evt = new AgentEventInfo + { + RunId = payload.TryGetProperty("runId", out var rid) ? rid.GetString() ?? "" : "", + Seq = payload.TryGetProperty("seq", out var seqProp) && seqProp.ValueKind == JsonValueKind.Number ? seqProp.GetInt32() : 0, + Stream = payload.TryGetProperty("stream", out var streamProp2) ? streamProp2.GetString() ?? "" : "", + Ts = payload.TryGetProperty("ts", out var tsProp) && tsProp.ValueKind == JsonValueKind.Number ? tsProp.GetDouble() : 0, + Data = payload.TryGetProperty("data", out var dataProp) ? dataProp.Clone() : default, + SessionKey = sessionKey, + Summary = payload.TryGetProperty("summary", out var sumProp) ? sumProp.GetString() : null + }; + AgentEventReceived?.Invoke(this, evt); + } + catch (Exception ex) + { + _logger.Warn($"Failed to emit agent event: {ex.Message}"); + } + + // Parse activity from stream field (existing behavior) if (payload.TryGetProperty("stream", out var streamProp)) { var stream = streamProp.GetString(); @@ -1538,68 +1816,82 @@ private void ParseSessions(JsonElement sessions) SessionInfo[] snapshot; lock (_sessionsLock) { - _sessions.Clear(); - - // Handle both Array format and Object (dictionary) format + // Merge instead of clear — collect incoming keys, update/add, then remove absent + var incomingKeys = new HashSet(); + if (sessions.ValueKind == JsonValueKind.Array) { foreach (var item in sessions.EnumerateArray()) { - ParseSessionItem(item); + var key = ParseSessionItem(item); + if (key != null) incomingKeys.Add(key); } } else if (sessions.ValueKind == JsonValueKind.Object) { - // Object format: keys are session IDs, values could be session info objects or simple strings foreach (var prop in 
sessions.EnumerateObject()) { var sessionKey = prop.Name; - - // Skip metadata fields that aren't actual sessions + if (sessionKey is "recent" or "count" or "path" or "defaults" or "ts") continue; - - // Skip non-session keys (must look like a session key pattern) + if (!sessionKey.Equals("global", StringComparison.OrdinalIgnoreCase) && !sessionKey.Contains(':') && !sessionKey.Contains("agent") && !sessionKey.Contains("session")) continue; - - var session = new SessionInfo { Key = sessionKey }; + var item = prop.Value; - - // Detect main session from key pattern - "agent:main:main" ends with ":main" + + if (item.ValueKind == JsonValueKind.String) + { + var strVal = item.GetString() ?? ""; + if (strVal.StartsWith("/") || strVal.Contains("/.")) + continue; + } + else if (item.ValueKind == JsonValueKind.Number) + { + continue; + } + + // Update or create session + if (!_sessions.TryGetValue(sessionKey, out var session)) + { + session = new SessionInfo { Key = sessionKey }; + } + var endsWithMain = sessionKey.EndsWith(":main"); session.IsMain = sessionKey == "main" || endsWithMain || sessionKey.Contains(":main:main"); - _logger.Debug($"Session key={sessionKey}, endsWithMain={endsWithMain}, IsMain={session.IsMain}"); - - // Value might be an object with session details or just a string status + if (item.ValueKind == JsonValueKind.Object) { - // Only override IsMain if the JSON explicitly says true if (item.TryGetProperty("isMain", out var isMain) && isMain.GetBoolean()) session.IsMain = true; PopulateSessionFromObject(session, item); } else if (item.ValueKind == JsonValueKind.String) { - // Simple string value - skip if it looks like a path (metadata) - var strVal = item.GetString() ?? ""; - if (strVal.StartsWith("/") || strVal.Contains("/.")) - continue; - session.Status = strVal; + session.Status = item.GetString() ?? 
""; } - else if (item.ValueKind == JsonValueKind.Number) - { - // Skip numeric values (like count) - continue; - } - - _sessions[session.Key] = session; + + _sessions[sessionKey] = session; + incomingKeys.Add(sessionKey); } } + // Remove sessions no longer present in the gateway response + { + var staleKeys = new List(); + foreach (var key in _sessions.Keys) + { + if (!incomingKeys.Contains(key)) + staleKeys.Add(key); + } + foreach (var key in staleKeys) + _sessions.Remove(key); + } + snapshot = GetSessionListInternal(); } @@ -1611,24 +1903,29 @@ private void ParseSessions(JsonElement sessions) } } - private void ParseSessionItem(JsonElement item) + private string? ParseSessionItem(JsonElement item) { - var session = new SessionInfo(); + var sessionKey = "unknown"; if (item.TryGetProperty("key", out var key)) - session.Key = key.GetString() ?? "unknown"; - - // Detect main from key pattern first - session.IsMain = session.Key == "main" || - session.Key.EndsWith(":main") || - session.Key.Contains(":main:main"); + sessionKey = key.GetString() ?? 
"unknown"; + + // Update or create + if (!_sessions.TryGetValue(sessionKey, out var session)) + { + session = new SessionInfo { Key = sessionKey }; + } + + session.IsMain = sessionKey == "main" || + sessionKey.EndsWith(":main") || + sessionKey.Contains(":main:main"); - // Only override if JSON explicitly says true if (item.TryGetProperty("isMain", out var isMain) && isMain.GetBoolean()) session.IsMain = true; PopulateSessionFromObject(session, item); _sessions[session.Key] = session; + return session.Key; } private void PopulateSessionFromObject(SessionInfo session, JsonElement item) @@ -1674,8 +1971,16 @@ private void PopulateSessionFromObject(SessionInfo session, JsonElement item) if (item.TryGetProperty("startedAt", out var started)) { - if (DateTime.TryParse(started.GetString(), out var dt)) - session.StartedAt = dt; + if (started.ValueKind == JsonValueKind.String) + { + if (DateTime.TryParse(started.GetString(), out var dt)) + session.StartedAt = dt; + } + else if (started.ValueKind == JsonValueKind.Number) + { + var ms = started.GetInt64(); + session.StartedAt = DateTimeOffset.FromUnixTimeMilliseconds(ms).LocalDateTime; + } } } @@ -2197,4 +2502,187 @@ private static string ShortenPath(string path) ? $"…/{parts[^2]}/{parts[^1]}" : parts[^1]; } + + // ── Parse methods for new features ── + + private void ParseModelsList(JsonElement payload) + { + try + { + var info = new ModelsListInfo(); + // Gateway returns { models: [...] } or just an array + var modelsArray = payload.ValueKind == JsonValueKind.Array + ? payload + : payload.TryGetProperty("models", out var m) ? m : default; + + if (modelsArray.ValueKind == JsonValueKind.Array) + { + foreach (var item in modelsArray.EnumerateArray()) + { + var model = new ModelInfo + { + Id = item.TryGetProperty("id", out var id) ? id.GetString() ?? "" : "", + Name = item.TryGetProperty("name", out var name) ? name.GetString() : null, + Provider = item.TryGetProperty("provider", out var prov) ? 
prov.GetString() : null, + ContextWindow = item.TryGetProperty("contextWindow", out var cw) && cw.ValueKind == JsonValueKind.Number ? cw.GetInt32() : null, + IsConfigured = item.TryGetProperty("configured", out var cfg) && cfg.ValueKind == JsonValueKind.True + }; + if (!string.IsNullOrEmpty(model.Id)) + info.Models.Add(model); + } + } + ModelsListUpdated?.Invoke(this, info); + } + catch (Exception ex) + { + _logger.Warn($"Failed to parse models.list: {ex.Message}"); + } + } + + private void ParseNodePairList(JsonElement payload) + { + try + { + var info = new PairingListInfo(); + var pending = payload.TryGetProperty("pending", out var p) ? p : default; + if (pending.ValueKind == JsonValueKind.Array) + { + foreach (var item in pending.EnumerateArray()) + { + info.Pending.Add(new PairingRequest + { + RequestId = item.TryGetProperty("requestId", out var rid) ? rid.GetString() ?? "" : "", + NodeId = item.TryGetProperty("nodeId", out var nid) ? nid.GetString() ?? "" : "", + DisplayName = item.TryGetProperty("displayName", out var dn) ? dn.GetString() : null, + Platform = item.TryGetProperty("platform", out var plat) ? plat.GetString() : null, + Version = item.TryGetProperty("version", out var ver) ? ver.GetString() : null, + RemoteIp = item.TryGetProperty("remoteIp", out var ip) ? ip.GetString() : null, + IsRepair = item.TryGetProperty("isRepair", out var rep) && rep.ValueKind == JsonValueKind.True, + Ts = item.TryGetProperty("ts", out var ts) && ts.ValueKind == JsonValueKind.Number ? ts.GetDouble() : 0 + }); + } + } + NodePairListUpdated?.Invoke(this, info); + } + catch (Exception ex) + { + _logger.Warn($"Failed to parse node.pair.list: {ex.Message}"); + } + } + + private void ParseDevicePairList(JsonElement payload) + { + try + { + var info = new DevicePairingListInfo(); + var pending = payload.TryGetProperty("pending", out var p) ? p : default; + if (pending.ValueKind == JsonValueKind.Array) + { + foreach (var item in pending.EnumerateArray()) + { + string[]? 
scopes = null; + if (item.TryGetProperty("scopes", out var sc) && sc.ValueKind == JsonValueKind.Array) + { + var scopeList = new List(); + foreach (var s in sc.EnumerateArray()) + if (s.GetString() is string sv) scopeList.Add(sv); + scopes = scopeList.ToArray(); + } + + info.Pending.Add(new DevicePairingRequest + { + RequestId = item.TryGetProperty("requestId", out var rid) ? rid.GetString() ?? "" : "", + DeviceId = item.TryGetProperty("deviceId", out var did) ? did.GetString() ?? "" : "", + PublicKey = item.TryGetProperty("publicKey", out var pk) ? pk.GetString() : null, + DisplayName = item.TryGetProperty("displayName", out var dn) ? dn.GetString() : null, + Platform = item.TryGetProperty("platform", out var plat) ? plat.GetString() : null, + ClientId = item.TryGetProperty("clientId", out var cid) ? cid.GetString() : null, + ClientMode = item.TryGetProperty("clientMode", out var cm) ? cm.GetString() : null, + Role = item.TryGetProperty("role", out var role) ? role.GetString() : null, + Scopes = scopes, + RemoteIp = item.TryGetProperty("remoteIp", out var ip) ? ip.GetString() : null, + IsRepair = item.TryGetProperty("isRepair", out var rep) && rep.ValueKind == JsonValueKind.True, + Ts = item.TryGetProperty("ts", out var ts) && ts.ValueKind == JsonValueKind.Number ? 
ts.GetDouble() : 0 + }); + } + } + DevicePairListUpdated?.Invoke(this, info); + } + catch (Exception ex) + { + _logger.Warn($"Failed to parse device.pair.list: {ex.Message}"); + } + } + + private void TryParsePresence(JsonElement payload) + { + try + { + if (!payload.TryGetProperty("snapshot", out var snapshot)) return; + if (!snapshot.TryGetProperty("presence", out var presenceArray)) return; + if (presenceArray.ValueKind != JsonValueKind.Array) return; + + var entries = ParsePresenceArray(presenceArray); + _logger.Info($"Parsed {entries.Length} presence entries from handshake"); + PresenceUpdated?.Invoke(this, entries); + } + catch (Exception ex) + { + _logger.Warn($"Failed to parse presence from handshake: {ex.Message}"); + } + } + + private void TryParsePresenceFromBroadcast(JsonElement payload) + { + try + { + // Broadcast may contain presence array directly or nested + var presenceArray = payload.ValueKind == JsonValueKind.Array + ? payload + : payload.TryGetProperty("presence", out var p) ? p : default; + + if (presenceArray.ValueKind != JsonValueKind.Array) return; + + var entries = ParsePresenceArray(presenceArray); + PresenceUpdated?.Invoke(this, entries); + } + catch (Exception ex) + { + _logger.Warn($"Failed to parse presence broadcast: {ex.Message}"); + } + } + + private static PresenceEntry[] ParsePresenceArray(JsonElement array) + { + var list = new List(); + foreach (var item in array.EnumerateArray()) + { + list.Add(new PresenceEntry + { + Host = item.TryGetProperty("host", out var h) ? h.GetString() : null, + Ip = item.TryGetProperty("ip", out var ip) ? ip.GetString() : null, + Version = item.TryGetProperty("version", out var v) ? v.GetString() : null, + Platform = item.TryGetProperty("platform", out var p) ? p.GetString() : null, + DeviceFamily = item.TryGetProperty("deviceFamily", out var df) ? df.GetString() : null, + ModelIdentifier = item.TryGetProperty("modelIdentifier", out var mi) ? 
mi.GetString() : null, + Mode = item.TryGetProperty("mode", out var m) ? m.GetString() : null, + LastInputSeconds = item.TryGetProperty("lastInputSeconds", out var lis) && lis.ValueKind == JsonValueKind.Number ? lis.GetInt32() : null, + Reason = item.TryGetProperty("reason", out var r) ? r.GetString() : null, + Tags = item.TryGetProperty("tags", out var t) && t.ValueKind == JsonValueKind.Array + ? t.EnumerateArray().Select(x => x.GetString() ?? "").Where(x => x.Length > 0).ToArray() + : null, + Text = item.TryGetProperty("text", out var tx) ? tx.GetString() : null, + Ts = item.TryGetProperty("ts", out var ts) && ts.ValueKind == JsonValueKind.Number ? ts.GetInt64() : 0, + DeviceId = item.TryGetProperty("deviceId", out var did) ? did.GetString() : null, + Roles = item.TryGetProperty("roles", out var roles) && roles.ValueKind == JsonValueKind.Array + ? roles.EnumerateArray().Select(x => x.GetString() ?? "").Where(x => x.Length > 0).ToArray() + : null, + Scopes = item.TryGetProperty("scopes", out var sc) && sc.ValueKind == JsonValueKind.Array + ? sc.EnumerateArray().Select(x => x.GetString() ?? "").Where(x => x.Length > 0).ToArray() + : null, + InstanceId = item.TryGetProperty("instanceId", out var iid) ? iid.GetString() : null, + }); + } + return list.ToArray(); + } } diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs index a9d0fdd5..e39dfcd2 100644 --- a/src/OpenClaw.Shared/SettingsData.cs +++ b/src/OpenClaw.Shared/SettingsData.cs @@ -38,6 +38,10 @@ public class SettingsData public string SttLanguage { get; set; } = "en-US"; public bool NodeTtsEnabled { get; set; } = false; public string TtsProvider { get; set; } = "windows"; + /// + /// ElevenLabs API key storage slot. When persisted by the Windows tray's + /// SettingsManager this is an opaque dpapi:-prefixed blob, not plaintext. + /// public string? TtsElevenLabsApiKey { get; set; } public string? TtsElevenLabsModel { get; set; } public string? 
TtsElevenLabsVoiceId { get; set; } @@ -55,6 +59,7 @@ public class SettingsData /// [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public bool? McpOnlyMode { get; set; } + public string? PreferredGatewayId { get; set; } public bool HasSeenActivityStreamTip { get; set; } = false; public string? SkippedUpdateTag { get; set; } public bool NotifyChatResponses { get; set; } = true; diff --git a/src/OpenClaw.Tray.WinUI/App.xaml b/src/OpenClaw.Tray.WinUI/App.xaml index 8b705905..6c3e4cd7 100644 --- a/src/OpenClaw.Tray.WinUI/App.xaml +++ b/src/OpenClaw.Tray.WinUI/App.xaml @@ -14,6 +14,9 @@ + + +