diff --git a/README.md b/README.md index ea677b44..2fdf8bb4 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,7 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t | **Canvas** | `canvas.present`, `canvas.hide`, `canvas.navigate`, `canvas.eval`, `canvas.snapshot`, `canvas.a2ui.push`, `canvas.a2ui.pushJSONL`, `canvas.a2ui.reset` | Display and control a WebView2 window | | **Screen** | `screen.snapshot`, `screen.record` | Capture screenshots and fixed-duration MP4 screen recordings | | **Camera** | `camera.list`, `camera.snap`, `camera.clip` | Enumerate cameras and capture still photos or short video clips | +| **Speech-to-text** | `stt.transcribe` | Capture audio from the default microphone for a bounded duration and return transcribed text. Default-off; opt-in via Settings. When enabled, advertised to both gateway callers (subject to gateway allowlist) and local MCP clients (subject to bearer token). | | **Location** | `location.get` | Return Windows geolocation when permission is available | | **Device** | `device.info`, `device.status` | Return Windows host/app metadata and lightweight status | | **Text-to-speech** | `tts.speak` | Speak text aloud through Windows speech synthesis, or ElevenLabs when configured | diff --git a/docs/WINDOWS_NODE_TESTING.md b/docs/WINDOWS_NODE_TESTING.md index 2c20c295..8ac4b06f 100644 --- a/docs/WINDOWS_NODE_TESTING.md +++ b/docs/WINDOWS_NODE_TESTING.md @@ -61,6 +61,7 @@ These features need the gateway to send `node.invoke` commands: | `location.get` | Get Windows location | Uses Windows location permission/settings | | `device.info` / `device.status` | Device metadata/status | Returns host/app/locale plus battery/storage/network/uptime payloads | | `browser.proxy` | Proxy browser-control host requests | Requires Browser proxy bridge enabled, a compatible browser-control host listening on gateway port + 2, and matching browser-control auth | +| `stt.transcribe` | Speech-to-text from default 
microphone | Default-off; bounded `maxDurationMs` ≤ 30000; concatenates phrases until duration elapses; requires explicit gateway allowlist | | `tts.speak` | Speak text aloud | Requires Text-to-speech playback enabled in Settings; gateway mode also requires `tts.speak` in `gateway.nodes.allowCommands` | ## Capabilities Advertised @@ -112,6 +113,40 @@ When the node connects, it advertises these capabilities: - If you see "Camera access blocked", enable camera access for desktop apps in Windows Privacy settings - Packaged MSIX builds will show the system consent prompt automatically +### `stt.transcribe` returns "Speech recognition failed" or "Internal Speech Error" +- Open Windows Settings → Privacy & security → Speech (`ms-settings:privacy-speech`) +- Turn **Online speech recognition** = On. The Windows speech recognizer's default dictation grammar often fails without it, and Windows surfaces an unmapped HRESULT as "Internal Speech Error" +- Open Windows Settings → Time & language → Language & region (`ms-settings:regionlanguage`), select your display language → Language options, and confirm **Speech** appears under Installed features (install it if not, ~50 MB; reboot or sign out/in afterward) +- Verify the recognizer end-to-end with `ms-settings:speech` → "Microphone" → **Get started** before re-trying `stt.transcribe` + +### `stt.transcribe` returns "Microphone permission denied" +- Open Windows Settings → Privacy & security → Microphone +- Ensure **Microphone access** (top-level toggle) is on +- For **unpackaged** tray builds (the default `.\build.ps1` output): ensure **Let desktop apps access your microphone** is on. 
The tray exe will **not** appear as its own row — desktop-app access is granted as a group, not per-app +- For **packaged MSIX** tray builds: the tray appears as its own entry under "Let apps access your microphone" and must be individually enabled (the OS shows a consent prompt on first use) +- After changing permissions, re-pair the node so the gateway picks up the new advertised command + +### `stt.transcribe` returns "Language pack 'X' is not installed" +- Open Windows Settings → Time & language → Language & region +- Add the requested display language and ensure the **Speech** optional feature is installed +- Restart the tray after installing the speech pack + +### Manual STT validation +1. Enable Node Mode in Settings. +2. Enable **Speech-to-text (microphone)** in Settings → Node mode. +3. Append `stt.transcribe` to your existing gateway allowlist (do **not** copy a literal `...` — substitute the commands you already allow). For example, starting from the recommended Windows safe companion list: + ```bash + openclaw config set gateway.nodes.allowCommands '["canvas.present","canvas.hide","canvas.navigate","canvas.eval","canvas.snapshot","canvas.a2ui.push","canvas.a2ui.pushJSONL","canvas.a2ui.reset","camera.list","location.get","screen.snapshot","device.info","device.status","system.execApprovals.get","system.execApprovals.set","stt.transcribe"]' + openclaw gateway restart + ``` +4. Re-pair or re-approve the node so the gateway refreshes its command snapshot. +5. Invoke and speak a short phrase: + ```bash + openclaw nodes invoke --node --command stt.transcribe \ + --params '{"maxDurationMs":5000,"language":"en-US"}' + ``` +6. The Windows microphone OS indicator should appear during recognition. Confirm a `transcribed:true` payload returns the text. + ## Remaining Work (Roadmap) 1. 
~~**system.run + exec approvals**~~ ✅ Implemented diff --git a/docs/gateway-node-integration.md b/docs/gateway-node-integration.md index 17fe3a9f..50e082d1 100644 --- a/docs/gateway-node-integration.md +++ b/docs/gateway-node-integration.md @@ -336,7 +336,7 @@ Recommended gateway defaults: | Command bucket | Windows default? | Reason | |----------------|------------------|--------| | Safe declared companion commands: `canvas.*`, `camera.list`, `location.get`, `screen.snapshot`, `device.info`, `device.status` | Yes | Matches macOS parity and only applies when declared by the node | -| Dangerous/privacy-heavy commands: `camera.snap`, `camera.clip`, `screen.record`, write commands like `contacts.add` | No | Existing gateway model already requires explicit `gateway.nodes.allowCommands` | +| Dangerous/privacy-heavy commands: `camera.snap`, `camera.clip`, `screen.record`, `stt.transcribe`, write commands like `contacts.add` | No | Existing gateway model already requires explicit `gateway.nodes.allowCommands` | | Exec commands: `system.run`, `system.run.prepare`, `system.which`, `system.notify`, `browser.proxy` | Yes | Existing Windows headless-host behavior | Until the gateway expands Windows safe defaults, the practical local solution is: @@ -364,6 +364,7 @@ Privacy-sensitive commands should stay out of the default safe list and should o camera.snap camera.clip screen.record +stt.transcribe ``` After changing either `gateway.nodes.allowCommands` or `gateway.nodes.denyCommands`, re-approve or re-pair the Windows node. Approved device records may keep a snapshot of the commands that were visible at approval time, so a gateway restart alone may not refresh existing approvals. @@ -424,6 +425,7 @@ Proposal: - `camera.snap` - `camera.clip` - `screen.record` + - `stt.transcribe` - write commands such as `contacts.add`, `calendar.add`, etc. This does not grant capabilities to headless Windows hosts by itself. 
A command still has to pass both gates: the node must declare it in `commands`, and the gateway policy must allow it. Headless Windows node hosts that only declare `system.run` / `system.which` remain exec-only. @@ -441,7 +443,7 @@ When shipping the Windows node, README/wiki should tell users: > ``` > Then re-pair the node (`openclaw devices reject ` + re-approve). > -> Add `camera.snap`, `camera.clip`, and `screen.record` only when you explicitly want to allow privacy-sensitive camera or screen capture. +> Add `camera.snap`, `camera.clip`, `screen.record`, and `stt.transcribe` only when you explicitly want to allow privacy-sensitive camera, screen, or microphone capture. > > The Windows tray Command Center (`openclaw://commandcenter`) surfaces these policy problems directly: it separates safe companion allowlist fixes from privacy-sensitive opt-ins and provides copyable repair text for safe fixes or pending pairing approval. diff --git a/src/OpenClaw.Shared/Audio/AudioModels.cs b/src/OpenClaw.Shared/Audio/AudioModels.cs new file mode 100644 index 00000000..8016ead8 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/AudioModels.cs @@ -0,0 +1,70 @@ +using System; +using System.Collections.Generic; + +namespace OpenClaw.Shared.Audio; + +/// Result of a speech-to-text transcription segment. +public sealed class TranscriptionResult +{ + public string Text { get; init; } = ""; + public TimeSpan Start { get; init; } + public TimeSpan End { get; init; } + public string Language { get; init; } = "en"; +} + +/// +/// Aggregated result of a single silence-bounded utterance — i.e. all the +/// Whisper segments produced from one VAD-bounded speech burst, combined. +/// Consumers that need "what the user said" (chat submission, stt.listen) +/// should listen for this event instead of per-segment TranscriptionResult +/// to avoid sending partial text. +/// +public sealed class UtteranceResult +{ + /// Concatenated text across all segments, single-spaced. 
+ public string Text { get; init; } = ""; + /// Language detected on the first segment, or null if no segments. + public string? Language { get; init; } + /// Start of the first segment relative to capture start. + public TimeSpan Start { get; init; } + /// End of the last segment relative to capture start. + public TimeSpan End { get; init; } + /// Immutable snapshot of the per-segment results. + public IReadOnlyList Segments { get; init; } = Array.Empty(); +} + +/// Voice-activity detection event. +public sealed class VadEvent +{ + public bool IsSpeaking { get; init; } + public float Probability { get; init; } +} + +/// Configuration for the audio pipeline. +public sealed class AudioPipelineOptions +{ + /// Path to the Whisper GGML model file. + public string ModelPath { get; init; } = ""; + + /// Language code for STT (e.g. "en", "auto"). + public string Language { get; init; } = "auto"; + + /// Seconds of silence before a speech segment is finalized. + public float SilenceTimeoutSeconds { get; init; } = 1.5f; + + /// Optional audio device ID. Null = system default microphone. + public string? DeviceId { get; init; } + + /// VAD probability threshold (0.0–1.0). Audio above this is considered speech. + public float VadThreshold { get; init; } = 0.3f; +} + +/// Pipeline state. +public enum AudioPipelineState +{ + Stopped, + Starting, + Listening, + Processing, + Error +} diff --git a/src/OpenClaw.Shared/Audio/PiperVoiceManager.cs b/src/OpenClaw.Shared/Audio/PiperVoiceManager.cs new file mode 100644 index 00000000..5c9f3a5f --- /dev/null +++ b/src/OpenClaw.Shared/Audio/PiperVoiceManager.cs @@ -0,0 +1,390 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.IO.Compression; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Audio; + +/// +/// Manages downloads and on-disk lifecycle for Piper TTS voices. 
+/// +/// Each "voice" is a sherpa-onnx pre-packaged tarball that contains +/// everything needed for offline synthesis — the .onnx model, the +/// tokens.txt phoneme map, and the language-specific espeak-ng-data. +/// We use the sherpa-onnx repackaged distribution rather than the raw +/// HuggingFace Piper voices because the latter requires the user (or +/// us) to ship espeak-ng-data separately (~80 MB shared across voices). +/// +/// Storage layout under the tray's data directory: +/// models/piper/<voice-id>/ +/// <voice-id>.onnx +/// tokens.txt +/// espeak-ng-data/... +/// +/// Each voice is ~50 MB compressed, ~80 MB extracted (with espeak data). +/// +/// **TODO (pre-GA):** SHA-256 verification of downloaded tarballs before +/// extraction (Audio_FollowUps.md §2). The current implementation trusts +/// HTTPS + the system trust chain only. +/// +public sealed class PiperVoiceManager +{ + private readonly string _voicesDirectory; + private readonly IOpenClawLogger _logger; + // Per-voice single-flight gate: prevents racing the same voice download + // from two callers (e.g. UI and a programmatic caller). Static so two + // PiperVoiceManager instances over the same data directory still + // coalesce against the same in-flight task. + private static readonly ConcurrentDictionary> InFlightDownloads = new(StringComparer.OrdinalIgnoreCase); + + /// + /// Curated catalog of Piper voices we offer in the UI. Each entry is + /// a sherpa-onnx pre-packaged tarball from the project's GitHub + /// releases. To add a voice: pick its key from + /// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models, + /// download the tarball, compute its SHA-256, and pin it below. + /// Sizes shown in the UI are approximate compressed sizes. + /// + /// SECURITY — pinned SHA-256 hashes (lowercase hex) verified against + /// the sherpa-onnx GitHub release on 2026-05-05. Downloads with a + /// different hash are rejected and the partial tarball is deleted. 
+ /// Before any public release: re-verify each hash from an independent + /// source and document provenance in Audio_FollowUps.md §2. + /// + public static readonly PiperVoiceInfo[] AvailableVoices = + [ + new("en_US-amy-low", "English (US) — Amy (low quality, fast)", "en-US", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2", + "c70f5284a09a7fd4ed203b39b2ff51cac1432b422b852eb647b481dade3cf639"), + new("en_US-libritts-high","English (US) — LibriTTS (high quality)", "en-US", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts-high.tar.bz2", + "d9d35056703fd38ed38e95c202a50f603fefdc8a92a7b6332c4f1a41616eac72"), + new("en_GB-alan-low", "English (GB) — Alan (low quality, fast)", "en-GB", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-alan-low.tar.bz2", + "1308e730b7a12c3b64b669d65daa0138fcb83b1a086edee92fa9fa68cb0290dd"), + new("fr_FR-siwis-low", "Français (FR) — Siwis (low quality, fast)","fr-FR", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-fr_FR-siwis-low.tar.bz2", + "3d69170c160c8375c4123901a72a3845222b39456d39ab74f5bbd7310952b5af"), + new("de_DE-thorsten-low","Deutsch (DE) — Thorsten (low quality)", "de-DE", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-de_DE-thorsten-low.tar.bz2", + "41fab35910fdcec4696b031951d8fd6c262e594cf77b35e1068fadbeb5a091a6"), + new("zh_CN-huayan-medium","中文 (CN) — Huayan (medium quality)", "zh-CN", + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-zh_CN-huayan-medium.tar.bz2", + "dbdfec42b91d9cee31cce9ff4b3e9c305eb6fbf60546d071f7e46273554cce6b"), + ]; + + public PiperVoiceManager(string dataDirectory, IOpenClawLogger logger) + { + _voicesDirectory = Path.Combine(dataDirectory, "models", "piper"); + _logger = logger; + Directory.CreateDirectory(_voicesDirectory); + } + + /// Root directory where this 
voice's files live (created lazily). + public string GetVoiceDirectory(string voiceId) + { + var info = FindVoice(voiceId); + return Path.Combine(_voicesDirectory, info.VoiceId); + } + + /// Path to the .onnx model file for a downloaded voice. + public string GetModelPath(string voiceId) + { + var dir = GetVoiceDirectory(voiceId); + // sherpa-onnx tarballs put files at the root of the voice dir; the + // model file is named after the voice id. + return Path.Combine(dir, $"{voiceId}.onnx"); + } + + /// Path to tokens.txt (phoneme map). + public string GetTokensPath(string voiceId) => Path.Combine(GetVoiceDirectory(voiceId), "tokens.txt"); + + /// Path to the espeak-ng-data directory bundled with this voice. + public string GetEspeakDataDir(string voiceId) => Path.Combine(GetVoiceDirectory(voiceId), "espeak-ng-data"); + + /// True when all three files are present on disk. + public bool IsVoiceDownloaded(string voiceId) + { + try + { + return File.Exists(GetModelPath(voiceId)) + && File.Exists(GetTokensPath(voiceId)) + && Directory.Exists(GetEspeakDataDir(voiceId)); + } + catch + { + // FindVoice throws on unknown voiceId — treat as not-downloaded. + return false; + } + } + + /// + /// Download and extract a Piper voice from the sherpa-onnx release. + /// Reports progress as bytes downloaded / total bytes (extraction + /// progress is not reported separately). + /// Per-voice single-flight: concurrent calls for the same voice await + /// the in-flight download instead of racing on the same temp tarball. + /// + public Task DownloadVoiceAsync( + string voiceId, + IProgress<(long downloaded, long total)>? 
progress = null, + CancellationToken cancellationToken = default) + { + var info = FindVoice(voiceId); + if (IsVoiceDownloaded(info.VoiceId)) + { + _logger.Info($"Piper voice '{info.VoiceId}' already downloaded"); + return Task.CompletedTask; + } + + // Preflight: bail out before downloading 50-150 MB if the OS isn't + // capable of extracting the .tar.bz2 we'd produce. tar.exe ships with + // Windows 10 1803+; older systems would fail at the extract step + // after a long, wasted download. + EnsureExtractorAvailable(); + + var key = info.VoiceId; + return SingleFlightDownload.RunAsync( + InFlightDownloads, + key, + token => DownloadVoiceCoreAsync(info, progress, token), + cancellationToken); + } + + private async Task DownloadVoiceCoreAsync( + PiperVoiceInfo info, + IProgress<(long downloaded, long total)>? progress, + CancellationToken cancellationToken) + { + // SECURITY: refuse to install any voice that doesn't have a pinned + // hash. See Audio_FollowUps.md §2. + if (string.IsNullOrWhiteSpace(info.Sha256)) + { + throw new InvalidOperationException( + $"Piper voice '{info.VoiceId}' has no pinned SHA-256; refusing to download. " + + "Add a verified hash to AvailableVoices before enabling this voice."); + } + + var voiceDir = Path.Combine(_voicesDirectory, info.VoiceId); + Directory.CreateDirectory(voiceDir); + var tarballPath = Path.Combine(voiceDir, $"{info.VoiceId}.tar.bz2.tmp"); + _logger.Info($"Downloading Piper voice '{info.VoiceId}' from {info.DownloadUrl}"); + + try + { + using var httpClient = new HttpClient(); + httpClient.Timeout = TimeSpan.FromMinutes(10); + using var response = await httpClient.GetAsync(info.DownloadUrl, HttpCompletionOption.ResponseHeadersRead, cancellationToken).ConfigureAwait(false); + response.EnsureSuccessStatusCode(); + + var totalBytes = response.Content.Headers.ContentLength ?? 
0; + using (var contentStream = await response.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false)) + using (var fileStream = new FileStream(tarballPath, FileMode.Create, FileAccess.Write, FileShare.None, 81920)) + { + var buffer = new byte[81920]; + long downloaded = 0; + int bytesRead; + while ((bytesRead = await contentStream.ReadAsync(buffer, cancellationToken).ConfigureAwait(false)) > 0) + { + await fileStream.WriteAsync(buffer.AsMemory(0, bytesRead), cancellationToken).ConfigureAwait(false); + downloaded += bytesRead; + progress?.Report((downloaded, totalBytes)); + } + } + + // SECURITY: verify SHA-256 of the downloaded tarball BEFORE we + // hand it to the extractor. tar reads file contents to disk; an + // attacker-controlled tarball could plant arbitrary files (path + // traversal aside, the .onnx model itself is loaded into the + // process). Fail closed on mismatch — partial dir cleanup runs + // in the catch block below. + await VerifyHashAsync(tarballPath, info.Sha256, info.VoiceId, cancellationToken); + + _logger.Info($"Extracting Piper voice '{info.VoiceId}'"); + ExtractTarBz2(tarballPath, voiceDir, cancellationToken); + + // Verify the extraction produced the files we expect; if not, + // tear the half-extracted dir down so a retry starts clean. + if (!IsVoiceDownloaded(info.VoiceId)) + { + throw new InvalidOperationException( + $"Extraction of Piper voice '{info.VoiceId}' did not produce the expected layout."); + } + + _logger.Info($"Piper voice '{info.VoiceId}' verified and ready at {voiceDir}"); + } + catch + { + // Best-effort cleanup — leaves the user able to retry without + // leftover partial files. 
+ try { if (File.Exists(tarballPath)) File.Delete(tarballPath); } catch { /* swallow */ } + try { if (Directory.Exists(voiceDir) && !IsVoiceDownloaded(info.VoiceId)) Directory.Delete(voiceDir, recursive: true); } catch { /* swallow */ } + throw; + } + finally + { + try { if (File.Exists(tarballPath)) File.Delete(tarballPath); } catch { /* swallow */ } + } + } + + /// + /// Compute SHA-256 of and compare to + /// . Throws on mismatch (caller is + /// expected to delete the file). Does not echo the actual hash to + /// avoid handing attackers a confirmation oracle. + /// + private static async Task VerifyHashAsync(string filePath, string expectedHex, string assetName, CancellationToken cancellationToken) + { + using var sha = System.Security.Cryptography.SHA256.Create(); + await using var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read, 81920, useAsync: true); + var actual = await sha.ComputeHashAsync(stream, cancellationToken); + var actualHex = Convert.ToHexString(actual).ToLowerInvariant(); + if (!string.Equals(actualHex, expectedHex, StringComparison.OrdinalIgnoreCase)) + { + throw new System.Security.SecurityException( + $"Piper voice '{assetName}' failed integrity check. The downloaded tarball does not match the pinned SHA-256."); + } + } + + /// Delete a downloaded voice directory. + public bool DeleteVoice(string voiceId) + { + var info = FindVoice(voiceId); + var dir = Path.Combine(_voicesDirectory, info.VoiceId); + if (!Directory.Exists(dir)) return false; + Directory.Delete(dir, recursive: true); + _logger.Info($"Deleted Piper voice '{info.VoiceId}'"); + return true; + } + + /// Total disk usage of a downloaded voice, or 0 if not downloaded. 
+ public long GetVoiceSize(string voiceId) + { + var info = FindVoice(voiceId); + var dir = Path.Combine(_voicesDirectory, info.VoiceId); + if (!Directory.Exists(dir)) return 0; + long total = 0; + foreach (var f in Directory.EnumerateFiles(dir, "*", SearchOption.AllDirectories)) + { + try { total += new FileInfo(f).Length; } catch { /* skip */ } + } + return total; + } + + /// + /// Probe the bundled OS tar.exe used by . + /// Throws a clear error before any network I/O happens so users on + /// downlevel Windows aren't left with a half-downloaded tarball. + /// + private static void EnsureExtractorAvailable() + { + try + { + var psi = new System.Diagnostics.ProcessStartInfo + { + FileName = "tar", + ArgumentList = { "--version" }, + UseShellExecute = false, + CreateNoWindow = true, + RedirectStandardOutput = true, + RedirectStandardError = true, + }; + using var proc = System.Diagnostics.Process.Start(psi); + if (proc == null) + { + throw new InvalidOperationException("tar.exe not found on PATH."); + } + proc.WaitForExit(2000); + if (!proc.HasExited) + { + try { proc.Kill(entireProcessTree: true); } catch { /* swallow */ } + throw new InvalidOperationException("tar.exe didn't respond to --version."); + } + if (proc.ExitCode != 0) + { + throw new InvalidOperationException($"tar.exe --version returned exit code {proc.ExitCode}."); + } + } + catch (System.ComponentModel.Win32Exception ex) + { + throw new InvalidOperationException( + "Piper voices need bundled tar (Windows 10 1803+). " + + "Your system doesn't have tar on PATH; please update Windows or install a tar utility.", ex); + } + } + + /// + /// Extract a .tar.bz2 archive in-place. We use SharpCompress (already a + /// transitive dependency via PiperSharp's ecosystem, but explicit here) + /// so we don't need to shell out to tar.exe. 
+ /// + private static void ExtractTarBz2(string archivePath, string destinationDir, CancellationToken cancellationToken) + { + // SharpCompress isn't a direct dep of OpenClaw.Shared today; we + // intentionally use the BCL .tar reader on top of a bzip2 stream + // from a small inline implementation. Keeping the dep surface small + // matters in this assembly because everything here is also referenced + // from OpenClaw.Cli. + // + // .NET 7+ ships System.Formats.Tar; bzip2 is not in the BCL, so we + // bring it in via a thin wrapper. For now the simplest-correct path + // is to call out to the OS-bundled `tar` (Win10 1803+ ships it), + // which transparently handles bz2. + var psi = new System.Diagnostics.ProcessStartInfo + { + FileName = "tar", + ArgumentList = { "-xjf", archivePath, "-C", destinationDir, "--strip-components=1" }, + UseShellExecute = false, + CreateNoWindow = true, + RedirectStandardError = true, + }; + using var proc = System.Diagnostics.Process.Start(psi) + ?? throw new InvalidOperationException("Could not start tar to extract Piper voice"); + + // Cancellation: kill the tar process if requested. + using var reg = cancellationToken.Register(() => { try { proc.Kill(entireProcessTree: true); } catch { /* swallow */ } }); + + proc.WaitForExit(); + if (proc.ExitCode != 0) + { + var err = proc.StandardError.ReadToEnd(); + throw new InvalidOperationException($"tar extraction failed (exit {proc.ExitCode}): {err}"); + } + } + + private static PiperVoiceInfo FindVoice(string voiceId) + { + foreach (var v in AvailableVoices) + { + if (string.Equals(v.VoiceId, voiceId, StringComparison.OrdinalIgnoreCase)) + return v; + } + var available = string.Join(", ", AvailableVoicesIds()); + throw new ArgumentException($"Unknown Piper voice: '{voiceId}'. Available: {available}"); + } + + private static IEnumerable AvailableVoicesIds() + { + foreach (var v in AvailableVoices) yield return v.VoiceId; + } +} + +/// Metadata about a Piper voice variant. 
+/// Short id, e.g. "en_US-amy-low". +/// Human-readable label for UI. +/// BCP-47 tag. +/// HTTPS URL of the .tar.bz2. +/// Pinned lowercase hex SHA-256 of the downloaded +/// tarball. MUST be set; downloads are refused when null. See the catalog +/// for the "verified on" date — these need re-verification before any +/// public release (see Audio_FollowUps.md §2). +public sealed record PiperVoiceInfo( + string VoiceId, + string DisplayName, + string LanguageTag, + string DownloadUrl, + string? Sha256); diff --git a/src/OpenClaw.Shared/Audio/SileroVadModelManifest.cs b/src/OpenClaw.Shared/Audio/SileroVadModelManifest.cs new file mode 100644 index 00000000..73637ba0 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/SileroVadModelManifest.cs @@ -0,0 +1,28 @@ +namespace OpenClaw.Shared.Audio; + +/// +/// Pinned descriptor for the Silero VAD ONNX model that the audio +/// pipeline auto-downloads on first use. +/// +/// SECURITY — same fail-closed verification discipline as +/// and : +/// the runtime checks the downloaded file's SHA-256 against +/// before installing it. The pinned hash here was +/// captured against the upstream raw URL on 2026-05-05; re-verify from +/// an independent source before any public release (Audio_FollowUps.md +/// §2 captures the broader signed-manifest plan). +/// +public static class SileroVadModelManifest +{ + public const string FileName = "silero_vad.onnx"; + + public const string DownloadUrl = + "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"; + + /// Lowercase hex SHA-256 of the canonical upstream file. + public const string Sha256 = "1a153a22f4509e292a94e67d6f9b85e8deb25b4988682b7e174c65279d8788e3"; + + /// Approximate compressed size in bytes (UI hint; actual size + /// is asserted via the SHA-256 check). 
+ public const long ApproximateSizeBytes = 2_327_524; +} diff --git a/src/OpenClaw.Shared/Audio/SingleFlightDownload.cs b/src/OpenClaw.Shared/Audio/SingleFlightDownload.cs new file mode 100644 index 00000000..215d46d6 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/SingleFlightDownload.cs @@ -0,0 +1,52 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Audio; + +internal static class SingleFlightDownload +{ + public static Task RunAsync( + ConcurrentDictionary> inFlight, + string key, + Func startDownload, + CancellationToken waitCancellationToken = default) + { + var candidate = new Lazy(() => + { + try + { + return startDownload(CancellationToken.None) + ?? Task.FromException(new InvalidOperationException("Download factory returned null.")); + } + catch (Exception ex) + { + return Task.FromException(ex); + } + }, LazyThreadSafetyMode.ExecutionAndPublication); + + var lazy = inFlight.GetOrAdd(key, candidate); + Task task; + try + { + task = lazy.Value; + } + catch + { + inFlight.TryRemove(new KeyValuePair>(key, lazy)); + throw; + } + + _ = task.ContinueWith( + _ => inFlight.TryRemove(new KeyValuePair>(key, lazy)), + CancellationToken.None, + TaskContinuationOptions.ExecuteSynchronously, + TaskScheduler.Default); + + return waitCancellationToken.CanBeCanceled + ? task.WaitAsync(waitCancellationToken) + : task; + } +} diff --git a/src/OpenClaw.Shared/Audio/SpeechToTextService.cs b/src/OpenClaw.Shared/Audio/SpeechToTextService.cs new file mode 100644 index 00000000..b0101c26 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/SpeechToTextService.cs @@ -0,0 +1,182 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Whisper.net; +using Whisper.net.Ggml; + +namespace OpenClaw.Shared.Audio; + +/// +/// Wraps Whisper.net for speech-to-text transcription. 
+/// Lazily loads the model on first use and caches the factory. +/// Thread-safe: concurrent calls are serialized by a semaphore. +/// +public sealed class SpeechToTextService : IDisposable +{ + private readonly IOpenClawLogger _logger; + private readonly SemaphoreSlim _gate = new(1, 1); + private WhisperFactory? _factory; + private string? _loadedModelPath; + + public bool IsModelLoaded => _factory != null; + public string? LoadedModelPath => _loadedModelPath; + + public SpeechToTextService(IOpenClawLogger logger) + { + _logger = logger; + } + + /// Load (or reload) the Whisper model from disk. + public void LoadModel(string modelPath) + { + if (!System.IO.File.Exists(modelPath)) + throw new System.IO.FileNotFoundException($"Whisper model not found: {modelPath}"); + + _factory?.Dispose(); + _factory = WhisperFactory.FromPath(modelPath); + _loadedModelPath = modelPath; + _logger.Info($"Whisper model loaded: {modelPath}"); + } + + /// Unload the current model and free memory. + public void UnloadModel() + { + _factory?.Dispose(); + _factory = null; + _loadedModelPath = null; + _logger.Info("Whisper model unloaded"); + } + + /// + /// Transcribe raw 16 kHz mono PCM float samples. + /// Returns all detected segments. + /// + public async Task> TranscribeAsync( + float[] samples, + string language = "auto", + CancellationToken cancellationToken = default) + { + if (_factory == null) + throw new InvalidOperationException("No Whisper model is loaded. Call LoadModel first."); + + await _gate.WaitAsync(cancellationToken); + try + { + // Whisper.net's WithLanguage expects either "auto" or a 2-letter + // ISO 639-1 code. The capability validator accepts the broader + // BCP-47 shape ("en-US", "zh-Hans-CN") because that's what the + // public docs advertise; normalize down here so Whisper actually + // sees something it understands. 
+ var whisperLang = NormalizeForWhisper(language); + var builder = _factory.CreateBuilder() + .WithLanguage(whisperLang) + .WithThreads(Math.Max(1, Environment.ProcessorCount / 2)); + + using var processor = builder.Build(); + + using var wavStream = PcmToWavStream(samples, 16000); + + var results = new List(); + await foreach (var segment in processor.ProcessAsync(wavStream, cancellationToken)) + { + var text = segment.Text?.Trim(); + if (!string.IsNullOrEmpty(text)) + { + results.Add(new TranscriptionResult + { + Text = text, + Start = segment.Start, + End = segment.End, + Language = whisperLang + }); + } + } + + return results; + } + finally + { + _gate.Release(); + } + } + + /// + /// Convert raw 16-bit PCM float samples to a WAV MemoryStream. + /// Whisper.net processes WAV streams natively. + /// + private static System.IO.MemoryStream PcmToWavStream(float[] samples, int sampleRate) + { + var ms = new System.IO.MemoryStream(); + using var writer = new System.IO.BinaryWriter(ms, System.Text.Encoding.UTF8, leaveOpen: true); + + int bitsPerSample = 16; + short channels = 1; + int byteRate = sampleRate * channels * bitsPerSample / 8; + short blockAlign = (short)(channels * bitsPerSample / 8); + int dataSize = samples.Length * blockAlign; + + // RIFF header + writer.Write("RIFF"u8); + writer.Write(36 + dataSize); + writer.Write("WAVE"u8); + + // fmt subchunk + writer.Write("fmt "u8); + writer.Write(16); // subchunk size + writer.Write((short)1); // PCM format + writer.Write(channels); + writer.Write(sampleRate); + writer.Write(byteRate); + writer.Write(blockAlign); + writer.Write((short)bitsPerSample); + + // data subchunk + writer.Write("data"u8); + writer.Write(dataSize); + + // Convert float [-1.0, 1.0] to int16 + foreach (var sample in samples) + { + var clamped = Math.Clamp(sample, -1.0f, 1.0f); + var int16 = (short)(clamped * 32767); + writer.Write(int16); + } + + writer.Flush(); + ms.Position = 0; + return ms; + } + + /// + /// Reduce a BCP-47 tag (e.g. 
"en-US", "zh-Hans-CN") to the 2-letter + /// language subtag that Whisper.net's WithLanguage call expects. + /// "auto" passes through unchanged. Returns "auto" for nulls/whitespace + /// or values that don't begin with at least 2 ASCII letters. + /// + internal static string NormalizeForWhisper(string? language) + { + if (string.IsNullOrWhiteSpace(language)) return "auto"; + var trimmed = language.Trim(); + if (string.Equals(trimmed, "auto", StringComparison.OrdinalIgnoreCase)) return "auto"; + + // Take everything up to the first '-' (the primary subtag) and lowercase. + var dash = trimmed.IndexOf('-'); + var primary = (dash >= 0 ? trimmed[..dash] : trimmed).ToLowerInvariant(); + + // Whisper expects 2-letter ISO 639-1. If the caller handed us a + // 3-letter ISO 639-3 tag (no good cross-walk without a table) or + // garbage, fall back to auto-detection rather than silently + // sending an invalid value. + if (primary.Length != 2 || primary[0] is < 'a' or > 'z' || primary[1] is < 'a' or > 'z') + return "auto"; + + return primary; + } + + public void Dispose() + { + _factory?.Dispose(); + _gate.Dispose(); + } +} diff --git a/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs b/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs new file mode 100644 index 00000000..3112d3a4 --- /dev/null +++ b/src/OpenClaw.Shared/Audio/VoiceActivityDetector.cs @@ -0,0 +1,108 @@ +using System; +using Microsoft.ML.OnnxRuntime; +using Microsoft.ML.OnnxRuntime.Tensors; + +namespace OpenClaw.Shared.Audio; + +/// +/// Voice Activity Detection using Silero VAD ONNX model. +/// Processes 16 kHz mono audio in 512-sample chunks (~32 ms each) +/// and returns a speech probability per chunk. +/// +public sealed class VoiceActivityDetector : IDisposable +{ + private InferenceSession? _session; + private float[] _state; // internal RNN state: shape [2, 1, 128] + private readonly int _stateSize; + private readonly IOpenClawLogger _logger; + + /// Expected sample rate for input audio. 
+    public const int SampleRate = 16000;
+
+    /// <summary>Number of samples per VAD chunk (512 @ 16 kHz = 32 ms).</summary>
+    public const int ChunkSamples = 512;
+
+    public bool IsLoaded => _session != null;
+
+    public VoiceActivityDetector(IOpenClawLogger logger)
+    {
+        _logger = logger;
+        _stateSize = 2 * 1 * 128;
+        _state = new float[_stateSize];
+    }
+
+    /// <summary>Load the Silero VAD ONNX model from disk.</summary>
+    public void LoadModel(string modelPath)
+    {
+        if (!System.IO.File.Exists(modelPath))
+            throw new System.IO.FileNotFoundException($"VAD model not found: {modelPath}");
+
+        var opts = new SessionOptions
+        {
+            InterOpNumThreads = 1,
+            IntraOpNumThreads = 1,
+            EnableCpuMemArena = true
+        };
+        opts.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL;
+
+        _session?.Dispose();
+        _session = new InferenceSession(modelPath, opts);
+        ResetState();
+        _logger.Info($"Silero VAD model loaded: {modelPath}");
+    }
+
+    /// <summary>Reset the internal RNN state (call between utterances).</summary>
+    public void ResetState()
+    {
+        Array.Clear(_state, 0, _state.Length);
+    }
+
+    /// <summary>
+    /// Process a single chunk of audio and return the speech probability (0.0–1.0).
+    /// Input must be exactly <see cref="ChunkSamples"/> (512) float samples at 16 kHz.
+    /// </summary>
+    public float ProcessChunk(float[] audioChunk)
+    {
+        if (_session == null)
+            throw new InvalidOperationException("VAD model not loaded. Call LoadModel first.");
+
+        if (audioChunk.Length != ChunkSamples)
+            throw new ArgumentException($"Audio chunk must be exactly {ChunkSamples} samples, got {audioChunk.Length}");
+
+        // Build input tensors matching Silero VAD v5 expected shapes. 
+ // See: github.com/snakers4/silero-vad/blob/master/examples/csharp/SileroVadOnnxModel.cs + var inputTensor = new DenseTensor(audioChunk, new[] { 1, ChunkSamples }); + var srTensor = new DenseTensor(new long[] { SampleRate }, new[] { 1 }); + var stateTensor = new DenseTensor(_state, new[] { 2, 1, 128 }); + + using var results = _session.Run(new List + { + NamedOnnxValue.CreateFromTensor("input", inputTensor), + NamedOnnxValue.CreateFromTensor("sr", srTensor), + NamedOnnxValue.CreateFromTensor("state", stateTensor) + }); + + float probability = 0f; + foreach (var result in results) + { + if (result.Name == "output") + { + var tensor = result.AsTensor(); + probability = tensor.Length > 0 ? tensor.GetValue(0) : 0f; + } + else if (result.Name == "stateN") + { + var newState = result.AsTensor(); + for (int i = 0; i < _stateSize && i < newState.Length; i++) + _state[i] = newState.GetValue(i); + } + } + + return probability; + } + + public void Dispose() + { + _session?.Dispose(); + } +} diff --git a/src/OpenClaw.Shared/Audio/WhisperModelManager.cs b/src/OpenClaw.Shared/Audio/WhisperModelManager.cs new file mode 100644 index 00000000..6b49341e --- /dev/null +++ b/src/OpenClaw.Shared/Audio/WhisperModelManager.cs @@ -0,0 +1,223 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Audio; + +/// +/// Manages Whisper GGML model downloads, storage, and lifecycle. +/// Models are stored in %APPDATA%\OpenClawTray\models\ (or the +/// configured data directory). +/// +public sealed class WhisperModelManager +{ + private readonly string _modelsDirectory; + private readonly IOpenClawLogger _logger; + // Per-model single-flight gate: a manual auto-download (VoiceService + // EnsureInitializedAsync) and a UI-triggered download for the same + // model would otherwise both write the same .tmp file. 
Static so an + // additional manager instance constructed elsewhere (e.g. the Settings + // page's status-only check) doesn't bypass the lock. + private static readonly ConcurrentDictionary> InFlightDownloads = new(StringComparer.OrdinalIgnoreCase); + + /// + /// Known Whisper model definitions. + /// + /// SECURITY — pinned SHA-256 hashes (lowercase hex) verified against + /// HuggingFace on 2026-05-05. Downloads with a different hash are + /// rejected and the partial file is deleted. Before any public release: + /// re-verify each hash from an independent source and document the + /// provenance in Audio_FollowUps.md §2 (also consider replacing this + /// inline table with a signed manifest). + /// + public static readonly WhisperModelInfo[] AvailableModels = + [ + new("ggml-tiny.bin", "tiny", 77_691_713, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin", + "be07e048e1e599ad46341c8d2a135645097a538221678b7acdd1b1919c6e1b21"), + new("ggml-base.bin", "base", 147_951_465, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin", + "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"), + new("ggml-small.bin", "small", 487_601_967, "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin", + "1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b"), + ]; + + public WhisperModelManager(string dataDirectory, IOpenClawLogger logger) + { + _modelsDirectory = Path.Combine(dataDirectory, "models"); + _logger = logger; + Directory.CreateDirectory(_modelsDirectory); + } + + /// Full file path for a given model name. + public string GetModelPath(string modelName) + { + var info = FindModel(modelName); + return Path.Combine(_modelsDirectory, info.FileName); + } + + /// Check whether a model file already exists on disk. 
+ public bool IsModelDownloaded(string modelName) + { + var path = GetModelPath(modelName); + return File.Exists(path); + } + + /// Get the size of a downloaded model, or 0 if not downloaded. + public long GetModelSize(string modelName) + { + var path = GetModelPath(modelName); + return File.Exists(path) ? new FileInfo(path).Length : 0; + } + + /// + /// Download a model from HuggingFace if not already present. + /// Reports progress as bytes downloaded / total bytes. + /// Per-model single-flight: concurrent calls for the same model await + /// the in-flight download instead of racing on the same .tmp file. + /// + public Task DownloadModelAsync( + string modelName, + IProgress<(long downloaded, long total)>? progress = null, + CancellationToken cancellationToken = default) + { + var info = FindModel(modelName); + var destPath = Path.Combine(_modelsDirectory, info.FileName); + + if (File.Exists(destPath)) + { + _logger.Info($"Model '{modelName}' already exists at {destPath}"); + return Task.CompletedTask; + } + + // Use the canonical key (FileName) so two callers that pass "base" + // and "ggml-base.bin" still coalesce. + var key = info.FileName; + return SingleFlightDownload.RunAsync( + InFlightDownloads, + key, + token => DownloadModelCoreAsync(info, destPath, progress, token), + cancellationToken); + } + + private async Task DownloadModelCoreAsync( + WhisperModelInfo info, + string destPath, + IProgress<(long downloaded, long total)>? progress, + CancellationToken cancellationToken) + { + // SECURITY: a missing pinned hash is treated as a hard failure so we + // never install an unverified asset. The catalog above pins all + // shipped models; if you add a new one without a hash, this is the + // place that refuses to download it. See Audio_FollowUps.md §2. + if (string.IsNullOrWhiteSpace(info.Sha256)) + { + throw new InvalidOperationException( + $"Whisper model '{info.Name}' has no pinned SHA-256; refusing to download. 
" + + "Add a verified hash to AvailableModels before enabling this model."); + } + + _logger.Info($"Downloading model '{info.Name}' from {info.DownloadUrl}"); + var tempPath = destPath + ".tmp"; + + try + { + using var httpClient = new HttpClient(); + httpClient.Timeout = TimeSpan.FromMinutes(30); + using var response = await httpClient.GetAsync(info.DownloadUrl, HttpCompletionOption.ResponseHeadersRead, cancellationToken); + response.EnsureSuccessStatusCode(); + + var totalBytes = response.Content.Headers.ContentLength ?? info.ApproximateSizeBytes; + using (var contentStream = await response.Content.ReadAsStreamAsync(cancellationToken)) + using (var fileStream = new FileStream(tempPath, FileMode.Create, FileAccess.Write, FileShare.None, 81920)) + { + var buffer = new byte[81920]; + long downloadedBytes = 0; + int bytesRead; + + while ((bytesRead = await contentStream.ReadAsync(buffer, cancellationToken)) > 0) + { + await fileStream.WriteAsync(buffer.AsMemory(0, bytesRead), cancellationToken); + downloadedBytes += bytesRead; + progress?.Report((downloadedBytes, totalBytes)); + } + + await fileStream.FlushAsync(cancellationToken); + } + + // SECURITY: verify SHA-256 BEFORE the atomic rename, so a + // tampered file never lands at the canonical path. On mismatch + // we delete the temp file (no partial install) and surface a + // sanitized error — we deliberately do NOT echo the actual + // hash because that gives an attacker a confirmation oracle. + await VerifyHashAsync(tempPath, info.Sha256, info.Name, cancellationToken); + + File.Move(tempPath, destPath, overwrite: true); + _logger.Info($"Model '{info.Name}' downloaded and verified"); + } + catch + { + // Clean up partial download + try { if (File.Exists(tempPath)) File.Delete(tempPath); } catch { /* best effort */ } + throw; + } + } + + /// + /// Compute SHA-256 of and compare to + /// . Throws on mismatch (and the caller + /// is expected to delete the file). 
Does not echo the actual hash to
+    /// avoid handing attackers a confirmation oracle.
+    /// </summary>
+    private static async Task VerifyHashAsync(string filePath, string expectedHex, string assetName, CancellationToken cancellationToken)
+    {
+        using var sha = System.Security.Cryptography.SHA256.Create();
+        await using var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read, 81920, useAsync: true);
+        var actual = await sha.ComputeHashAsync(stream, cancellationToken);
+        var actualHex = Convert.ToHexString(actual).ToLowerInvariant();
+        if (!string.Equals(actualHex, expectedHex, StringComparison.OrdinalIgnoreCase))
+        {
+            throw new System.Security.SecurityException(
+                $"Whisper model '{assetName}' failed integrity check. The downloaded file does not match the pinned SHA-256.");
+        }
+    }
+
+    /// <summary>Delete a downloaded model file.</summary>
+    public bool DeleteModel(string modelName)
+    {
+        var path = GetModelPath(modelName);
+        if (!File.Exists(path)) return false;
+        File.Delete(path);
+        _logger.Info($"Deleted model '{modelName}'");
+        return true;
+    }
+
+    private static WhisperModelInfo FindModel(string modelName)
+    {
+        foreach (var m in AvailableModels)
+        {
+            if (string.Equals(m.Name, modelName, StringComparison.OrdinalIgnoreCase))
+                return m;
+        }
+        throw new ArgumentException($"Unknown model: '{modelName}'. Available: tiny, base, small");
+    }
+}
+
+/// <summary>Metadata about a Whisper model variant.</summary>
+/// <param name="FileName">On-disk filename (e.g. "ggml-base.bin").</param>
+/// <param name="Name">Short identifier used by callers ("tiny" / "base" / "small").</param>
+/// <param name="ApproximateSizeBytes">Approximate size hint for UI; the
+/// actual content is verified against <paramref name="Sha256"/> after download.</param>
+/// <param name="DownloadUrl">HTTPS URL of the model file.</param>
+/// <param name="Sha256">Pinned lowercase hex SHA-256 of the downloaded file.
+/// MUST be set; downloads are refused when null. See the catalog for the
+/// "verified on" date — these need re-verification before any public
+/// release (see Audio_FollowUps.md §2).</param>
+public sealed record WhisperModelInfo( + string FileName, + string Name, + long ApproximateSizeBytes, + string DownloadUrl, + string? Sha256); diff --git a/src/OpenClaw.Shared/Capabilities/SttCapability.cs b/src/OpenClaw.Shared/Capabilities/SttCapability.cs new file mode 100644 index 00000000..4a944054 --- /dev/null +++ b/src/OpenClaw.Shared/Capabilities/SttCapability.cs @@ -0,0 +1,339 @@ +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Capabilities; + +/// +/// Speech-to-text node capability. Three commands: +/// +/// * — bounded fixed-duration capture + transcription. +/// Caller must specify maxDurationMs (capped at ). +/// Useful for quick "give me 5 seconds of audio" prompts. +/// +/// * — VAD-driven capture that returns when speech ends +/// or after timeoutMs (default , range +/// ..). +/// Useful for conversational "listen until I stop talking" prompts. +/// +/// * — reports engine readiness (no PII). +/// +/// The actual engine lives in the tray (Whisper.net + NAudio + Silero VAD). +/// Whisper is local-first and privacy-respecting; the legacy WinRT +/// SpeechRecognizer + desktop SAPI fallback was removed because both +/// stacks are old, can leak audio to the Microsoft cloud (online-speech), +/// and don't work in unpackaged builds. +/// +/// **Privacy invariants for the response surface:** +/// - Validation errors never echo the caller-supplied language string. +/// - Handler exceptions never propagate their Message into the response; +/// full detail stays in the local logger only. This is critical because +/// failed-invoke errors land in recent activity / support bundles. +/// - response carries no PII (no transcript fragments, +/// no language history, no device IDs, no model paths). 
+/// +public sealed class SttCapability : NodeCapabilityBase +{ + public const string TranscribeCommand = "stt.transcribe"; + public const string ListenCommand = "stt.listen"; + public const string StatusCommand = "stt.status"; + + public const int MaxTranscribeDurationMs = 30_000; + public const int MinListenTimeoutMs = 1_000; + public const int MaxListenTimeoutMs = 120_000; + public const int DefaultListenTimeoutMs = 30_000; + + public const string DefaultLanguage = "en-US"; + public const string AutoLanguage = "auto"; + + /// + /// Engine identifier returned in engineEffective on every successful + /// stt.* response. Currently always "whisper"; the field exists so + /// adding a future engine doesn't break the wire shape. + /// + public const string EngineWhisper = "whisper"; + + private static readonly string[] _commands = [TranscribeCommand, ListenCommand, StatusCommand]; + + // Conservative BCP-47 check: 2-3 letter language, optional script + // (4 letter), optional region (2 letter or 3 digit), each separated + // by a hyphen. Rejects whitespace and punctuation that would otherwise + // trip Windows.Globalization.Language ctor. The literal "auto" + // sentinel is accepted in addition (Whisper supports auto-detect). + private static readonly Regex BcpTagRegex = new( + "^[A-Za-z]{2,3}(?:-[A-Za-z]{4})?(?:-(?:[A-Za-z]{2}|[0-9]{3}))?$", + RegexOptions.Compiled); + + public override string Category => "stt"; + public override IReadOnlyList Commands => _commands; + + /// + /// Tray-side handler for : bounded fixed-duration + /// capture + transcription. + /// + public event Func>? TranscribeRequested; + + /// + /// Tray-side handler for : VAD-driven capture that + /// returns on end-of-speech or after timeoutMs. + /// + public event Func>? ListenRequested; + + /// + /// Tray-side handler for : returns per-engine readiness. + /// + public event Func>? 
StatusRequested; + + public SttCapability(IOpenClawLogger logger) : base(logger) { } + + /// + /// Trim and validate a single language tag. Returns the trimmed tag on + /// success, the literal sentinel on a case-insensitive + /// "auto" input, or null if the input is neither. + /// Public so UI surfaces can validate against the same rule the wire applies. + /// + public static string? NormalizeLanguageTag(string tag) + { + var trimmed = tag.Trim(); + if (string.Equals(trimmed, AutoLanguage, StringComparison.OrdinalIgnoreCase)) + return AutoLanguage; + return BcpTagRegex.IsMatch(trimmed) ? trimmed : null; + } + + /// + /// Resolve the language to use for a recognition call: per-call argument + /// wins, then configured setting, then . + /// Returns null if the resolved string fails validation. + /// + public static string? ResolveLanguage(string? requested, string? configured) + { + var candidate = !string.IsNullOrWhiteSpace(requested) + ? requested + : (!string.IsNullOrWhiteSpace(configured) ? configured : DefaultLanguage); + + return NormalizeLanguageTag(candidate!); + } + + public override Task ExecuteAsync(NodeInvokeRequest request) + => ExecuteAsync(request, CancellationToken.None); + + public override async Task ExecuteAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + return request.Command switch + { + TranscribeCommand => await HandleTranscribeAsync(request, cancellationToken).ConfigureAwait(false), + ListenCommand => await HandleListenAsync(request, cancellationToken).ConfigureAwait(false), + StatusCommand => await HandleStatusAsync(cancellationToken).ConfigureAwait(false), + _ => Error($"Unknown command: {request.Command}") + }; + } + + private async Task HandleTranscribeAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + // maxDurationMs is required and bounded server-side. 
We deliberately + // reject 0/negative rather than substituting a default — callers + // explicitly choose how much mic time they're spending. + var maxDurationMs = GetIntArg(request.Args, "maxDurationMs", 0); + if (maxDurationMs <= 0) + return Error("Missing required maxDurationMs"); + if (maxDurationMs > MaxTranscribeDurationMs) + return Error($"maxDurationMs exceeds {MaxTranscribeDurationMs} ms"); + + var requestedLanguage = GetStringArg(request.Args, "language"); + string? resolvedLanguage = null; + if (!string.IsNullOrWhiteSpace(requestedLanguage)) + { + resolvedLanguage = NormalizeLanguageTag(requestedLanguage); + if (resolvedLanguage == null) + return Error("Invalid language tag"); + } + + if (TranscribeRequested == null) + return Error("STT transcribe not available"); + + var args = new SttTranscribeArgs + { + MaxDurationMs = maxDurationMs, + Language = resolvedLanguage // null lets the tray fall back to its configured setting + }; + + Logger.Info($"stt.transcribe: maxDurationMs={args.MaxDurationMs}, language={args.Language ?? "(default)"}"); + + try + { + var result = await TranscribeRequested(args, cancellationToken).ConfigureAwait(false); + return Success(new + { + transcribed = result.Transcribed, + text = result.Text, + durationMs = result.DurationMs, + language = result.Language, + engineEffective = result.EngineEffective + }); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + return Error("Transcribe canceled"); + } + catch (Exception ex) + { + // Privacy: never echo raw exception text into the response. The + // exception flows through the failed-invoke path and may be + // persisted to recent activity / support bundles. Full detail + // stays in the local log only. 
+ Logger.Error("STT transcribe failed", ex); + return Error("Transcribe failed"); + } + } + + private async Task HandleListenAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + // timeoutMs is optional with a sane default; bounded both ways so + // a hostile caller can't pin the mic open for an hour. + var timeoutMs = GetIntArg(request.Args, "timeoutMs", DefaultListenTimeoutMs); + if (timeoutMs < MinListenTimeoutMs) timeoutMs = MinListenTimeoutMs; + if (timeoutMs > MaxListenTimeoutMs) timeoutMs = MaxListenTimeoutMs; + + var requestedLanguage = GetStringArg(request.Args, "language"); + string resolvedLanguage = AutoLanguage; + if (!string.IsNullOrWhiteSpace(requestedLanguage)) + { + var normalized = NormalizeLanguageTag(requestedLanguage); + if (normalized == null) + return Error("Invalid language tag"); + resolvedLanguage = normalized; + } + + if (ListenRequested == null) + return Error("STT listen not available"); + + var args = new SttListenArgs + { + TimeoutMs = timeoutMs, + Language = resolvedLanguage + }; + + Logger.Info($"stt.listen: timeoutMs={timeoutMs}, language={resolvedLanguage}"); + + try + { + var result = await ListenRequested(args, cancellationToken).ConfigureAwait(false); + return Success(new + { + text = result.Text, + language = result.Language, + durationMs = result.DurationMs, + segments = result.Segments, + engineEffective = result.EngineEffective + }); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + return Error("Listen canceled"); + } + catch (Exception ex) + { + // Same privacy invariant as Transcribe. 
+ Logger.Error("STT listen failed", ex); + return Error("Listen failed"); + } + } + + private async Task HandleStatusAsync(CancellationToken cancellationToken) + { + if (StatusRequested == null) + return Error("STT status not available"); + + try + { + var result = await StatusRequested(cancellationToken).ConfigureAwait(false); + return Success(new + { + engine = result.Engine, + readiness = result.Readiness, + modelDownloadProgress = result.ModelDownloadProgress, + isListenWithVadSupported = result.IsListenWithVadSupported, + isBoundedTranscribeSupported = result.IsBoundedTranscribeSupported + }); + } + catch (Exception ex) + { + // Status must not leak engine internals; carry only a fixed message. + Logger.Error("STT status failed", ex); + return Error("Status failed"); + } + } +} + +public sealed class SttTranscribeArgs +{ + public int MaxDurationMs { get; set; } + /// + /// BCP-47 tag (e.g., "en-US"), the literal "auto" sentinel, or null + /// to let the tray fall back to its configured SttLanguage setting. + /// + public string? Language { get; set; } +} + +public sealed class SttTranscribeResult +{ + public bool Transcribed { get; set; } + public string Text { get; set; } = ""; + public int DurationMs { get; set; } + public string Language { get; set; } = SttCapability.DefaultLanguage; + + /// + /// Engine that served this call. Always + /// today; the field exists so a future engine doesn't break the wire. + /// + public string EngineEffective { get; set; } = SttCapability.EngineWhisper; +} + +public sealed class SttListenArgs +{ + public int TimeoutMs { get; set; } + /// + /// BCP-47 tag (e.g., "en-US"), or the literal "auto" sentinel + /// (default; lets Whisper auto-detect). 
+ /// + public string Language { get; set; } = SttCapability.AutoLanguage; +} + +public sealed class SttListenResult +{ + public string Text { get; set; } = ""; + public string Language { get; set; } = SttCapability.AutoLanguage; + public int DurationMs { get; set; } + public IReadOnlyList Segments { get; set; } = Array.Empty(); + + public string EngineEffective { get; set; } = SttCapability.EngineWhisper; +} + +public sealed class SttSegment +{ + public string Text { get; set; } = ""; + public int StartMs { get; set; } + public int EndMs { get; set; } +} + +public sealed class SttStatusResult +{ + public string Engine { get; set; } = SttCapability.EngineWhisper; + + /// One of "ready", "initializing", "model-downloading", "model-not-downloaded", "unavailable". + public string Readiness { get; set; } = "unavailable"; + + /// 0..1 download progress when == "model-downloading"; null otherwise. + public double? ModelDownloadProgress { get; set; } + + public bool IsListenWithVadSupported { get; set; } + public bool IsBoundedTranscribeSupported { get; set; } +} diff --git a/src/OpenClaw.Shared/Capabilities/TtsCapability.cs b/src/OpenClaw.Shared/Capabilities/TtsCapability.cs index c6407828..23878b22 100644 --- a/src/OpenClaw.Shared/Capabilities/TtsCapability.cs +++ b/src/OpenClaw.Shared/Capabilities/TtsCapability.cs @@ -10,6 +10,11 @@ public sealed class TtsCapability : NodeCapabilityBase public const string SpeakCommand = "tts.speak"; public const string WindowsProvider = "windows"; public const string ElevenLabsProvider = "elevenlabs"; + /// + /// Local neural TTS via Sherpa-ONNX wrapping Piper voices. No network + /// egress; voice models download once to %LOCALAPPDATA%. + /// + public const string PiperProvider = "piper"; public const int MaxTextLength = 5000; private static readonly string[] _commands = [SpeakCommand]; @@ -30,7 +35,7 @@ public static string ResolveProvider(string? requestedProvider, string? 
configur : requestedProvider; return string.IsNullOrWhiteSpace(provider) - ? WindowsProvider + ? PiperProvider : provider.Trim().ToLowerInvariant(); } @@ -81,8 +86,14 @@ public override async Task ExecuteAsync( } catch (Exception ex) { + // Privacy: never echo raw exception text into the response. The + // exception flows through the failed-invoke path and may be + // persisted to recent activity / support bundles. ElevenLabs + // error messages can contain key prefixes; OS speech errors + // can contain device names. Full detail stays in the local + // log only. (Same pattern as SttCapability.) Logger.Error("TTS speak failed", ex); - return Error($"Speak failed: {ex.Message}"); + return Error("Speak failed"); } } diff --git a/src/OpenClaw.Shared/DeepLinkParser.cs b/src/OpenClaw.Shared/DeepLinkParser.cs index 21bcdf7c..4ef68266 100644 --- a/src/OpenClaw.Shared/DeepLinkParser.cs +++ b/src/OpenClaw.Shared/DeepLinkParser.cs @@ -20,10 +20,13 @@ public static class DeepLinkParser if (!uri.StartsWith(Scheme, StringComparison.OrdinalIgnoreCase)) return null; - var remainder = uri[Scheme.Length..].TrimEnd('/'); + var remainder = uri[Scheme.Length..]; var queryIndex = remainder.IndexOf('?'); var query = queryIndex >= 0 ? remainder[(queryIndex + 1)..] : ""; - var path = queryIndex >= 0 ? remainder[..queryIndex] : remainder; + // Trim trailing slash AFTER splitting off the query so the + // Windows-canonicalized form `openclaw://send/?args=...` (slash + // BEFORE the `?`) yields path "send", not "send/". + var path = (queryIndex >= 0 ? 
remainder[..queryIndex] : remainder).TrimEnd('/'); var parameters = new Dictionary(StringComparer.OrdinalIgnoreCase); foreach (var part in query.Split('&', StringSplitOptions.RemoveEmptyEntries)) diff --git a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs index e6ce348a..2bae2000 100644 --- a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs +++ b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs @@ -236,9 +236,18 @@ private object HandleToolsList() ["camera.clip"] = "Record a short clip from a camera. Args: deviceId (string, optional), durationMs (int, required, max 60000), format ('mp4'|'webm', default 'mp4'), maxWidth (int, default 1280). Returns { format, durationMs, base64 }.", + // stt.* — microphone capture → text. Default-off; privacy-sensitive. + // Single engine: Whisper.net runs locally on the device. + ["stt.transcribe"] = + "Capture microphone audio for a bounded duration and return the transcribed text. Args: maxDurationMs (int, required, > 0, max 30000), language (string, optional BCP-47 tag like 'en-US' or 'auto' — falls back to the configured SttLanguage setting). Returns { transcribed, text, durationMs, language, engineEffective ('whisper') }. Whisper model is downloaded on first use; until then this returns an error pointing to Voice Settings. Requires NodeSttEnabled.", + ["stt.listen"] = + "Capture microphone audio with voice-activity detection and return when the user stops speaking, or after timeoutMs. Args: timeoutMs (int, optional, default 30000, range 1000..120000), language (string, optional BCP-47 tag or 'auto', default 'auto'). Returns { text, language, durationMs, segments[{ text, startMs, endMs }], engineEffective ('whisper') }. Result is the full silence-bounded utterance (all Whisper segments concatenated), not a partial first segment. Requires NodeSttEnabled.", + ["stt.status"] = + "Report STT engine readiness. No args. 
Returns { engine ('whisper'), readiness ('ready'|'initializing'|'model-downloading'|'model-not-downloaded'|'unavailable'), modelDownloadProgress (0..1 or null), isListenWithVadSupported (bool), isBoundedTranscribeSupported (bool) }. Carries no PII (no transcript history, no language history, no device IDs, no model paths).", + // tts.* ["tts.speak"] = - "Speak text aloud on the Windows node. Args: text (string, required), provider ('windows'|'elevenlabs', optional), voiceId (string, optional), model (string, optional), interrupt (bool, default false). Returns { spoken, provider, contentType, durationMs }.", + "Speak text aloud on the Windows node. Args: text (string, required), provider ('piper'|'windows'|'elevenlabs', optional — falls back to the configured TtsProvider setting, default 'piper' for fresh installs), voiceId (string, optional — overrides the per-provider configured voice), model (string, optional, ElevenLabs only), interrupt (bool, default false — interrupts any in-progress playback). 
Returns { spoken, provider, contentType, durationMs }.", // app.* ["app.navigate"] = diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs index 496e879a..3ce16f99 100644 --- a/src/OpenClaw.Shared/Models.cs +++ b/src/OpenClaw.Shared/Models.cs @@ -762,7 +762,7 @@ public static List BuildDefaultWindowsMatrix() { Name = "Microphone", Status = "review", - Detail = "Required only for camera clips with audio or future voice features.", + Detail = "Required for camera clips with audio and for stt.transcribe speech-to-text capture.", SettingsUri = "ms-settings:privacy-microphone" }, new() @@ -1019,7 +1019,7 @@ public static class CommandCenterCommandGroups public static readonly FrozenSet SafeCompanionCommandSet = SafeCompanionCommands.ToFrozenSet(StringComparer.OrdinalIgnoreCase); - public static readonly string[] DangerousCommands = + public static readonly string[] CommonDangerousCommands = [ "camera.snap", "camera.clip", @@ -1027,6 +1027,14 @@ public static class CommandCenterCommandGroups "tts.speak" ]; + public static readonly string[] DangerousCommands = + [ + .. CommonDangerousCommands, + "stt.transcribe", + "stt.listen", + "stt.status" + ]; + public static readonly FrozenSet DangerousCommandSet = DangerousCommands.ToFrozenSet(StringComparer.OrdinalIgnoreCase); @@ -1235,7 +1243,7 @@ public static List BuildNodeWarnings(NodeCapabilityHea Severity = GatewayDiagnosticSeverity.Info, Category = "allowlist", Title = "Privacy-sensitive commands are currently blocked", - Detail = $"{blocked} {(node.MissingDangerousAllowlistCommands.Count == 1 ? "is" : "are")} declared but filtered by gateway policy. Leave blocked unless you explicitly want camera or screen recording access for this node.", + Detail = $"{blocked} {(node.MissingDangerousAllowlistCommands.Count == 1 ? "is" : "are")} declared but filtered by gateway policy. 
Leave blocked unless you explicitly want camera, microphone, or screen recording access for this node.", RepairAction = "Copy opt-in guidance", CopyText = BuildDangerousCommandOptInGuidance(node.MissingDangerousAllowlistCommands) }); diff --git a/src/OpenClaw.Shared/OpenClaw.Shared.csproj b/src/OpenClaw.Shared/OpenClaw.Shared.csproj index ac832955..2c0a6df5 100644 --- a/src/OpenClaw.Shared/OpenClaw.Shared.csproj +++ b/src/OpenClaw.Shared/OpenClaw.Shared.csproj @@ -15,6 +15,13 @@ + + + + + + + diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs index f1983d88..a8096c0c 100644 --- a/src/OpenClaw.Shared/SettingsData.cs +++ b/src/OpenClaw.Shared/SettingsData.cs @@ -34,8 +34,24 @@ public class SettingsData public bool NodeCameraEnabled { get; set; } = true; public bool NodeLocationEnabled { get; set; } = true; public bool NodeBrowserProxyEnabled { get; set; } = true; + public bool NodeSttEnabled { get; set; } = false; + /// STT language: "auto" for Whisper auto-detect, or a BCP-47 tag like "en-US". + public string SttLanguage { get; set; } = "auto"; + /// Whisper model name: "tiny", "base", or "small". + public string SttModelName { get; set; } = "base"; + /// Seconds of silence before auto-submit in voice chat mode. + public float SttSilenceTimeout { get; set; } = 2.5f; + /// Enable TTS playback of responses during voice sessions. + public bool VoiceTtsEnabled { get; set; } = true; + /// Play audio feedback chimes on listen start/stop. + public bool VoiceAudioFeedback { get; set; } = true; public bool NodeTtsEnabled { get; set; } = false; - public string TtsProvider { get; set; } = "windows"; + public string TtsProvider { get; set; } = OpenClaw.Shared.Capabilities.TtsCapability.PiperProvider; + /// Persisted: whether the Hub's NavigationView pane is expanded + /// (true) or collapsed/compact (false). Default true. + public bool HubNavPaneOpen { get; set; } = true; + /// Optional Windows TTS voice id (or display name). 
Empty = system default. + public string? TtsWindowsVoiceId { get; set; } /// /// ElevenLabs API key storage slot. When persisted by the Windows tray's /// SettingsManager this is an opaque dpapi:-prefixed blob, not plaintext. @@ -43,6 +59,8 @@ public class SettingsData public string? TtsElevenLabsApiKey { get; set; } public string? TtsElevenLabsModel { get; set; } public string? TtsElevenLabsVoiceId { get; set; } + /// Piper voice identifier, e.g. "en_US-amy-low". Voice file is downloaded on first use. + public string TtsPiperVoiceId { get; set; } = "en_US-amy-low"; /// Run the local MCP HTTP server. Independent of EnableNodeMode. public bool EnableMcpServer { get; set; } = false; /// @@ -64,6 +82,8 @@ public class SettingsData public bool PreferStructuredCategories { get; set; } = true; public List? UserRules { get; set; } + // ── (Voice / STT settings consolidated into the block above.) ── + private static readonly JsonSerializerOptions s_options = new() { WriteIndented = true, diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs index 35930815..fc4f49af 100644 --- a/src/OpenClaw.Tray.WinUI/App.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs @@ -380,6 +380,7 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) { _globalHotkey = new GlobalHotkeyService(); _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed; + _globalHotkey.VoiceHotkeyPressed += OnVoiceHotkeyPressed; _globalHotkey.Register(); } @@ -454,6 +455,49 @@ private void ShowChatWindow() } } + private VoiceOverlayWindow? _voiceOverlayWindow; + private VoiceService? _standaloneVoiceService; + + private void ShowVoiceOverlay() + { + var voiceService = _nodeService?.VoiceService ?? 
EnsureStandaloneVoiceService(); + if (voiceService == null) + { + // STT not enabled — show settings + ShowHub("voice"); + return; + } + + if (_voiceOverlayWindow == null || _voiceOverlayWindow.AppWindow == null) + { + _voiceOverlayWindow = new VoiceOverlayWindow(voiceService, new AppLogger()); + _voiceOverlayWindow.Closed += (_, _) => _voiceOverlayWindow = null; + // Wire transcription to gateway chat when connected + _voiceOverlayWindow.TextSubmitted += text => + { + if (_gatewayClient != null && _currentStatus == ConnectionStatus.Connected) + { + _ = _gatewayClient.SendChatMessageAsync(text); + } + }; + // Wire Settings button → open the Hub on the Voice & Audio page. + _voiceOverlayWindow.SettingsRequested += () => + { + _dispatcherQueue?.TryEnqueue(() => ShowHub("voice")); + }; + } + + _voiceOverlayWindow.Activate(); + } + + private VoiceService? EnsureStandaloneVoiceService() + { + if (_settings?.NodeSttEnabled != true) + return null; + + return _standaloneVoiceService ??= new VoiceService(new AppLogger(), _settings); + } + private void OnTrayContextMenu(TrayIcon sender, TrayIconEventArgs e) { // Right-click: show menu @@ -503,6 +547,7 @@ private void OnTrayMenuItemClicked(object? 
sender, string action) case "dashboard": OpenDashboard(); break; case "canvas": _nodeService?.ShowCanvasWindow(); break; case "openchat": ShowChatWindow(); break; + case "voice": ShowVoiceOverlay(); break; case "webchat": ShowWebChat(); break; case "hub": ShowHub(); break; case "companion": @@ -1031,6 +1076,7 @@ private void BuildTrayMenuPopup(TrayMenuWindow menu) menu.AddMenuItem("Dashboard", "🌐", "dashboard"); menu.AddMenuItem("Chat", "💬", "openchat"); menu.AddMenuItem("Canvas", "🎨", "canvas"); + menu.AddMenuItem("Voice", "🎙️", "voice"); menu.AddMenuItem("Companion", "🦞", "companion"); menu.AddMenuItem(LocalizationHelper.GetString("Menu_QuickSend"), "📤", "quicksend"); @@ -1863,6 +1909,7 @@ private void SyncHubNodeState() _hubWindow.NodeIsPendingApproval = _nodeService.IsPendingApproval; _hubWindow.NodeShortDeviceId = _nodeService.ShortDeviceId; _hubWindow.NodeFullDeviceId = _nodeService.FullDeviceId; + _hubWindow.VoiceServiceInstance = _nodeService.VoiceService; } else { @@ -2318,6 +2365,32 @@ private void OnNotificationReceived(object? sender, OpenClawNotification notific $"{notification.Type ?? "info"}: {notification.Title ?? "notification"}", category: "notification", details: notification.Message); + + // Voice overlay: show agent chat responses, and (independently) speak them + // if the user enabled "Read responses aloud". TTS used to be gated on + // an active voice overlay session — we want the toggle to honor every + // chat reply now that voice and text chat will eventually share one UI. + if (notification.IsChat && !string.IsNullOrEmpty(notification.Message)) + { + if (_voiceOverlayWindow != null) + { + _dispatcherQueue?.TryEnqueue(() => + { + try + { + _voiceOverlayWindow?.AddAgentResponse(notification.Message); + } + catch { } + }); + } + + // TTS: read response aloud whenever the toggle is on (any chat surface). 
+ if (_settings?.VoiceTtsEnabled == true) + { + _ = SpeakResponseAsync(notification.Message); + } + } + if (_settings?.ShowNotifications != true) return; if (!ShouldShowNotification(notification)) return; @@ -2519,7 +2592,7 @@ private string BuildTrayTooltip() #region Window Management - private void ShowHub(string? navigateTo = null) + private void ShowHub(string? navigateTo = null, bool activate = true) { if (_hubWindow == null || _hubWindow.IsClosed) { @@ -2561,6 +2634,7 @@ private void ShowHub(string? navigateTo = null) _hubWindow.NodeShortDeviceId = _nodeService.ShortDeviceId; _hubWindow.NodeFullDeviceId = _nodeService.FullDeviceId; } + _hubWindow.VoiceServiceInstance = _nodeService?.VoiceService ?? _standaloneVoiceService; _hubWindow.SettingsSaved += OnSettingsSaved; _hubWindow.Closed += (s, e) => { @@ -2578,6 +2652,7 @@ private void ShowHub(string? navigateTo = null) _hubWindow.Settings = _settings; _hubWindow.GatewayClient = _gatewayClient; _hubWindow.CurrentStatus = _currentStatus; + _hubWindow.VoiceServiceInstance = _nodeService?.VoiceService ?? _standaloneVoiceService; if (_nodeService != null) { _hubWindow.NodeIsConnected = _nodeService.IsConnected; @@ -2594,7 +2669,29 @@ private void ShowHub(string? navigateTo = null) { _hubWindow.NavigateTo(navigateTo); } - _hubWindow.Activate(); + if (activate) + { + _hubWindow.Activate(); + } + else + { + // Show without stealing focus — used by right-click on the + // tray icon where the popup needs to remain the foreground + // window (popups light-dismiss if focus moves away). + // If the Hub was minimized, restore it first so it actually + // becomes visible behind the popup; otherwise Show(false) + // is a no-op on a minimized window. 
+ try + { + if (_hubWindow.AppWindow.Presenter is Microsoft.UI.Windowing.OverlappedPresenter op + && op.State == Microsoft.UI.Windowing.OverlappedPresenterState.Minimized) + { + op.Restore(activateWindow: false); + } + _hubWindow.AppWindow.Show(activateWindow: false); + } + catch { /* swallow */ } + } } private void SeedHubCachedData() @@ -3637,8 +3734,6 @@ private void CopyExtensibilitySummary() private void OnGlobalHotkeyPressed(object? sender, EventArgs e) { - // Hotkey events are raised from a dedicated Win32 message-loop thread. - // Creating/activating WinUI windows must happen on the app's UI thread. if (_dispatcherQueue == null) { Logger.Warn("Hotkey pressed but DispatcherQueue is null"); @@ -3652,6 +3747,12 @@ private void OnGlobalHotkeyPressed(object? sender, EventArgs e) } } + private void OnVoiceHotkeyPressed(object? sender, EventArgs e) + { + if (_dispatcherQueue == null) return; + _dispatcherQueue.TryEnqueue(() => ShowVoiceOverlay()); + } + #endregion #region Updates @@ -3871,6 +3972,8 @@ private void HandleDeepLink(string uri) OpenDashboard = OpenDashboard, OpenQuickSend = ShowQuickSend, OpenHub = (page) => ShowHub(page), + OpenVoice = () => ShowVoiceOverlay(), + StopVoice = () => _ = StopVoiceAsync(), SendMessage = async (msg) => { if (_gatewayClient != null) @@ -3881,6 +3984,58 @@ private void HandleDeepLink(string uri) }); } + private async Task StopVoiceAsync() + { + var voiceService = _nodeService?.VoiceService; + if (voiceService != null) + await voiceService.StopAsync(); + } + + private int _ttsMuteCount; + + private async Task SpeakResponseAsync(string text) + { + var voiceService = _nodeService?.VoiceService; + var ttsService = _nodeService?.TextToSpeech; + try + { + if (voiceService == null || _settings == null || ttsService == null) return; + + // Increment mute counter — multiple concurrent TTS won't unmute prematurely + Interlocked.Increment(ref _ttsMuteCount); + voiceService.IsMutedForPlayback = true; + + var speakText = text.Length > 
500 ? text[..500] + "..." : text; + + // Don't pass VoiceId here. The shared TextToSpeechService picks + // the right per-provider voice from settings (TtsPiperVoiceId, + // TtsWindowsVoiceId, TtsElevenLabsVoiceId). Cross-provider + // voice IDs would otherwise leak across providers. + var speakArgs = new OpenClaw.Shared.Capabilities.TtsSpeakArgs + { + Text = speakText, + Provider = _settings.TtsProvider ?? TtsCapability.PiperProvider, + Interrupt = true + }; + + await ttsService.SpeakAsync(speakArgs); + } + catch (Exception ex) + { + Logger.Warn($"TTS response playback failed: {ex.Message}"); + } + finally + { + // Only unmute when all concurrent TTS operations have finished + if (voiceService != null) + { + await Task.Delay(300); + if (Interlocked.Decrement(ref _ttsMuteCount) <= 0) + voiceService.IsMutedForPlayback = false; + } + } + } + private static void SendDeepLinkToRunningInstance(string uri) { try @@ -3988,6 +4143,12 @@ private void ExitApplication() _nodeService = null; }); + SafeShutdownStep("standalone voice service", () => + { + _standaloneVoiceService?.DisposeAsync().AsTask().GetAwaiter().GetResult(); + _standaloneVoiceService = null; + }); + SafeShutdownStep("ssh tunnel service", () => { _sshTunnelService?.Dispose(); diff --git a/src/OpenClaw.Tray.WinUI/Onboarding/Pages/WizardPage.cs b/src/OpenClaw.Tray.WinUI/Onboarding/Pages/WizardPage.cs index 687bd716..7355f1ec 100644 --- a/src/OpenClaw.Tray.WinUI/Onboarding/Pages/WizardPage.cs +++ b/src/OpenClaw.Tray.WinUI/Onboarding/Pages/WizardPage.cs @@ -282,7 +282,7 @@ async void SubmitStep() var answerValue = string.IsNullOrEmpty(stepInput) ? 
"true" : stepInput; // Smart timeout: 5min for auth-related steps (device code polling), 30s for everything else - var isAuthStep = !string.IsNullOrEmpty(stepMessage) && + var isAuthStep = !string.IsNullOrEmpty(stepMessage) && (stepMessage.Contains("device", StringComparison.OrdinalIgnoreCase) || stepMessage.Contains("authorize", StringComparison.OrdinalIgnoreCase) || stepMessage.Contains("login", StringComparison.OrdinalIgnoreCase) || @@ -639,4 +639,3 @@ async void SkipStep() .Padding(0, 8, 0, 0); } } - diff --git a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj index b9873035..ed8afcbd 100644 --- a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj +++ b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj @@ -60,6 +60,8 @@ + + diff --git a/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml b/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml index 2022f27b..bfff9a6f 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml +++ b/src/OpenClaw.Tray.WinUI/Pages/CapabilitiesPage.xaml @@ -25,6 +25,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + settings.NodeScreenEnabled = v), ("📍", "Location", settings.NodeLocationEnabled, v => settings.NodeLocationEnabled = v), ("🔊", "Text-to-Speech", settings.NodeTtsEnabled, v => settings.NodeTtsEnabled = v), + ("🎤", "Speech-to-Text", settings.NodeSttEnabled, v => settings.NodeSttEnabled = v), }; var items = new List(); @@ -59,6 +71,8 @@ private void BuildCapabilityToggles(HubWindow hub) setter(toggle.IsOn); settings.Save(); hub.RaiseSettingsSaved(); + UpdateSttCard(hub); + UpdateTtsCard(hub); UpdateNodeStatus(hub); }; items.Add(toggle); @@ -67,6 +81,162 @@ private void BuildCapabilityToggles(HubWindow hub) CapabilityRepeater.ItemsSource = items; } + // ============================================================ + // Speech-to-Text settings card + // ============================================================ + + private void 
UpdateSttCard(HubWindow hub) + { + var enabled = hub.Settings?.NodeSttEnabled == true; + SttCard.Visibility = enabled ? Visibility.Visible : Visibility.Collapsed; + if (!enabled || hub.Settings == null) return; + + UpdateSttEngineHint(hub); + } + + private void UpdateSttEngineHint(HubWindow hub) + { + // Whisper is the only engine. Surface model-readiness so the user + // knows what (if anything) needs to happen before stt.* will work. + // + // Check the file directly via WhisperModelManager rather than going + // through hub.VoiceServiceInstance — that instance is only created + // by NodeService.RegisterCapabilities() at Connect time, so a user + // who toggled STT on but hasn't reconnected yet would see a stale + // "not downloaded" message even with the file on disk. + var modelName = hub.Settings?.SttModelName ?? "base"; + var modelManager = new OpenClaw.Shared.Audio.WhisperModelManager( + SettingsManager.SettingsDirectoryPath, new AppLogger()); + var modelDownloaded = modelManager.IsModelDownloaded(modelName); + var modelDownloading = hub.VoiceServiceInstance?.IsWhisperDownloadingModel ?? false; + + if (modelDownloaded) + { + SttEngineHint.Text = "Whisper model is ready. Speech-to-text runs fully on this PC; no audio leaves the device."; + } + else if (modelDownloading) + { + SttEngineHint.Text = "Whisper model is downloading. Speech-to-text will be available once it's ready."; + } + else + { + SttEngineHint.Text = "Whisper model is not downloaded. Open More voice settings… to download it before using speech-to-text."; + } + } + + private void OnSttMoreSettingsClick(object sender, RoutedEventArgs e) + { + // Navigate the Hub to the dedicated voice settings page. 
+ _hub?.NavigateTo("voice"); + } + + // ============================================================ + // Text-to-Speech settings card + // ============================================================ + + private void UpdateTtsCard(HubWindow hub) + { + var enabled = hub.Settings?.NodeTtsEnabled == true; + TtsCard.Visibility = enabled ? Visibility.Visible : Visibility.Collapsed; + if (!enabled || hub.Settings == null) return; + + var settings = hub.Settings; + + _suppressTtsProviderChange = true; + // ComboBox order: 0=Piper, 1=Windows, 2=ElevenLabs. + TtsProviderComboBox.SelectedIndex = settings.TtsProvider switch + { + var p when string.Equals(p, TtsCapability.ElevenLabsProvider, StringComparison.OrdinalIgnoreCase) => 2, + var p when string.Equals(p, TtsCapability.WindowsProvider, StringComparison.OrdinalIgnoreCase) => 1, + _ => 0 // default to Piper for unknown / null / whitespace + }; + _suppressTtsProviderChange = false; + + // PasswordBox shows a masked sentinel when we already have a saved + // key, so the user can tell something is set without us ever + // putting plaintext on screen. + TtsElevenLabsApiKeyBox.Password = + string.IsNullOrEmpty(settings.TtsElevenLabsApiKey) ? "" : SavedApiKeySentinel; + TtsElevenLabsVoiceIdBox.Text = settings.TtsElevenLabsVoiceId; + TtsElevenLabsModelBox.Text = settings.TtsElevenLabsModel; + + UpdateTtsElevenLabsPanelVisibility(); + TtsStatusText.Text = ""; + } + + private void UpdateTtsElevenLabsPanelVisibility() + { + var isEleven = (TtsProviderComboBox.SelectedItem is ComboBoxItem item) + && string.Equals(item.Tag as string, TtsCapability.ElevenLabsProvider, StringComparison.OrdinalIgnoreCase); + TtsElevenLabsPanel.Visibility = isEleven ? 
Visibility.Visible : Visibility.Collapsed; + } + + private void OnTtsProviderSelectionChanged(object sender, SelectionChangedEventArgs e) + { + if (_suppressTtsProviderChange) return; + if (_hub?.Settings == null) return; + + var newProvider = (TtsProviderComboBox.SelectedItem is ComboBoxItem item && item.Tag is string tag) + ? tag + : TtsCapability.WindowsProvider; + + if (!string.Equals(_hub.Settings.TtsProvider, newProvider, StringComparison.OrdinalIgnoreCase)) + { + _hub.Settings.TtsProvider = newProvider; + _hub.Settings.Save(); + _hub.RaiseSettingsSaved(); + TtsStatusText.Text = $"Default provider: {newProvider}"; + } + + UpdateTtsElevenLabsPanelVisibility(); + } + + private void OnTtsElevenLabsCommitted(object sender, RoutedEventArgs e) + { + if (_hub?.Settings == null) return; + var settings = _hub.Settings; + + var changed = false; + + // Treat the sentinel as "keep existing"; only overwrite when the + // user has typed a real key. + var typedKey = TtsElevenLabsApiKeyBox.Password ?? ""; + if (!string.Equals(typedKey, SavedApiKeySentinel, StringComparison.Ordinal)) + { + var trimmedKey = typedKey.Trim(); + if (!string.Equals(settings.TtsElevenLabsApiKey, trimmedKey, StringComparison.Ordinal)) + { + settings.TtsElevenLabsApiKey = trimmedKey; + changed = true; + } + } + + var voiceId = TtsElevenLabsVoiceIdBox.Text?.Trim() ?? ""; + if (!string.Equals(settings.TtsElevenLabsVoiceId, voiceId, StringComparison.Ordinal)) + { + settings.TtsElevenLabsVoiceId = voiceId; + changed = true; + } + + var model = TtsElevenLabsModelBox.Text?.Trim() ?? ""; + if (!string.Equals(settings.TtsElevenLabsModel, model, StringComparison.Ordinal)) + { + settings.TtsElevenLabsModel = model; + changed = true; + } + + if (changed) + { + settings.Save(); + _hub.RaiseSettingsSaved(); + // Re-render the API key field so the sentinel tracks the newly + // saved state instead of leaving the typed key visible. 
+ TtsElevenLabsApiKeyBox.Password = + string.IsNullOrEmpty(settings.TtsElevenLabsApiKey) ? "" : SavedApiKeySentinel; + TtsStatusText.Text = "ElevenLabs settings saved."; + } + } + private void UpdateNodeStatus(HubWindow hub) { var nodeEnabled = hub.Settings?.EnableNodeMode ?? false; @@ -90,6 +260,7 @@ private void UpdateNodeStatus(HubWindow hub) if (hub.Settings?.NodeScreenEnabled == true) caps.Add("screen"); if (hub.Settings?.NodeLocationEnabled == true) caps.Add("location"); if (hub.Settings?.NodeTtsEnabled == true) caps.Add("tts"); + if (hub.Settings?.NodeSttEnabled == true) caps.Add("stt"); NodeDetailsText.Text = caps.Count > 0 ? $"Providing {caps.Count} capabilities: {string.Join(", ", caps)}" : "No capabilities enabled."; diff --git a/src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml b/src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml index 3c4c5828..e3687d4b 100644 --- a/src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml +++ b/src/OpenClaw.Tray.WinUI/Pages/HomePage.xaml @@ -19,14 +19,14 @@ - + - - string.Equals(s.Status, "active", StringComparison.OrdinalIgnoreCase)) ?? 
false; @@ -72,17 +72,17 @@ private void UpdateMoltyRing(ConnectionStatus status) if (status == ConnectionStatus.Connected && hasActiveSessions) { // Agent working — animated blue ring - MoltyRing.Visibility = Visibility.Collapsed; - MoltyProgressRing.IsActive = true; - MoltyProgressRing.Visibility = Visibility.Visible; + CompanionRing.Visibility = Visibility.Collapsed; + CompanionProgressRing.IsActive = true; + CompanionProgressRing.Visibility = Visibility.Visible; } else { - MoltyProgressRing.IsActive = false; - MoltyProgressRing.Visibility = Visibility.Collapsed; - MoltyRing.Visibility = Visibility.Visible; + CompanionProgressRing.IsActive = false; + CompanionProgressRing.Visibility = Visibility.Collapsed; + CompanionRing.Visibility = Visibility.Visible; - MoltyRing.Stroke = status switch + CompanionRing.Stroke = status switch { ConnectionStatus.Connected => new SolidColorBrush(Colors.LimeGreen), ConnectionStatus.Error => new SolidColorBrush(Colors.Red), @@ -133,7 +133,7 @@ public void UpdateSessions(SessionInfo[] sessions) _lastSessions = sessions; DispatcherQueue?.TryEnqueue(() => { - UpdateMoltyRing(_lastStatus); + UpdateCompanionRing(_lastStatus); UpdateStatusText(_lastStatus); }); } diff --git a/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml b/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml new file mode 100644 index 00000000..34c3bf20 --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Pages/VoiceSettingsPage.xaml @@ -0,0 +1,187 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs new file mode 100644 index 00000000..e18feb78 --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceOverlayWindow.xaml.cs @@ -0,0 +1,345 @@ +using System; +using System.Globalization; +using 
System.Threading.Tasks; +using Microsoft.UI.Dispatching; +using Microsoft.UI.Xaml; +using Microsoft.UI.Xaml.Controls; +using Microsoft.UI.Xaml.Media; +using OpenClaw.Shared; +using OpenClaw.Shared.Audio; +using OpenClawTray.Helpers; +using OpenClawTray.Services; +using WinUIEx; + +namespace OpenClawTray.Windows; + +/// +/// Floating voice overlay window for voice chat sessions. +/// Shows conversation transcript, audio levels, and controls. +/// +public sealed partial class VoiceOverlayWindow : WindowEx +{ + private readonly VoiceService _voiceService; + private readonly IOpenClawLogger _logger; + private readonly DispatcherQueue _dispatcherQueue; + private bool _isMuted; + + /// Fired when the user submits transcribed text to the agent. + public event Action? TextSubmitted; + + /// Fired when the user clicks the Settings button. Hosts should + /// navigate to the Voice & Audio page (e.g. via ShowHub("voice")). + public event Action? SettingsRequested; + + public VoiceOverlayWindow(VoiceService voiceService, IOpenClawLogger logger) + { + InitializeComponent(); + _voiceService = voiceService; + _logger = logger; + _dispatcherQueue = DispatcherQueue.GetForCurrentThread(); + + // Modern custom title bar + ExtendsContentIntoTitleBar = true; + SetTitleBar(AppTitleBar); + + _voiceService.TranscriptionReceived += OnTranscriptionReceived; + _voiceService.UtteranceCompleted += OnUtteranceCompleted; + _voiceService.SpeakingChanged += OnSpeakingChanged; + _voiceService.AudioLevelChanged += OnAudioLevelChanged; + _voiceService.ModeChanged += OnModeChanged; + _voiceService.PipelineStateChanged += OnPipelineStateChanged; + _voiceService.DiagnosticMessage += OnDiagnosticMessage; + + Closed += WindowClosed; + UpdateUI(); + } + + private DateTime _lastUserBubbleTime = DateTime.MinValue; + private TextBlock? _lastUserTextBlock; + + private void OnTranscriptionReceived(string text) + { + _dispatcherQueue.TryEnqueue(() => + { + // Per-segment bubble update (visual streaming). 
Consolidate into + // the last user bubble when fragments arrive within 5 seconds so + // a multi-segment utterance reads as one bubble in the transcript. + var elapsed = DateTime.UtcNow - _lastUserBubbleTime; + if (_lastUserTextBlock != null && elapsed.TotalSeconds < 5) + { + _lastUserTextBlock.Text += " " + text; + _lastUserBubbleTime = DateTime.UtcNow; + try + { + TranscriptScroller.UpdateLayout(); + TranscriptScroller.ChangeView(null, TranscriptScroller.ScrollableHeight, null); + } + catch { } + } + else + { + AddTranscriptBubble(text, isUser: true); + } + // NOTE: chat submission moved to OnUtteranceCompleted so the + // gateway receives one message per spoken utterance, not one per + // Whisper segment. + }); + } + + private void OnUtteranceCompleted(OpenClaw.Shared.Audio.UtteranceResult utterance) + { + // Fire once per silence-bounded utterance. The visual bubble already + // shows the streamed text; here we just hand the complete sentence + // to the gateway exactly once. + _dispatcherQueue.TryEnqueue(() => + { + if (!string.IsNullOrWhiteSpace(utterance.Text)) + TextSubmitted?.Invoke(utterance.Text); + }); + } + + /// Add an agent response to the transcript. + public void AddAgentResponse(string text) + { + _dispatcherQueue.TryEnqueue(() => + { + AddTranscriptBubble(text, isUser: false); + }); + } + + private void AddTranscriptBubble(string text, bool isUser) + { + try + { + // Hide empty state on first message + if (EmptyState.Visibility == Visibility.Visible) + EmptyState.Visibility = Visibility.Collapsed; + + var bubble = new Border + { + Background = isUser + ? new SolidColorBrush(Microsoft.UI.Colors.DodgerBlue) + : (Brush)Application.Current.Resources["CardBackgroundFillColorDefaultBrush"], + CornerRadius = isUser + ? new CornerRadius(12, 12, 4, 12) + : new CornerRadius(12, 12, 12, 4), + Padding = new Thickness(12, 10, 12, 10), + HorizontalAlignment = isUser + ? HorizontalAlignment.Right + : HorizontalAlignment.Left, + Margin = new Thickness(isUser ? 
24 : 0, 4, isUser ? 0 : 24, 4) + }; + + var icon = isUser ? "\uE77B" : "\uE799"; // Person / Robot + var grid = new Grid { ColumnSpacing = 8 }; + grid.ColumnDefinitions.Add(new ColumnDefinition { Width = GridLength.Auto }); + grid.ColumnDefinitions.Add(new ColumnDefinition { Width = new GridLength(1, GridUnitType.Star) }); + + var fontIcon = new FontIcon { Glyph = icon, FontSize = 12, VerticalAlignment = VerticalAlignment.Top, Margin = new Thickness(0, 3, 0, 0) }; + Grid.SetColumn(fontIcon, 0); + grid.Children.Add(fontIcon); + + var textBlock = new TextBlock + { + Text = text, + TextWrapping = TextWrapping.Wrap, + FontSize = 13, + IsTextSelectionEnabled = true + }; + if (isUser) + { + textBlock.Foreground = new SolidColorBrush(Microsoft.UI.Colors.White); + _lastUserTextBlock = textBlock; + _lastUserBubbleTime = DateTime.UtcNow; + } + else + { + // Agent response breaks the consolidation window + _lastUserTextBlock = null; + } + Grid.SetColumn(textBlock, 1); + grid.Children.Add(textBlock); + + bubble.Child = grid; + TranscriptPanel.Children.Add(bubble); + + // Auto-scroll to bottom + TranscriptScroller.UpdateLayout(); + TranscriptScroller.ChangeView(null, TranscriptScroller.ScrollableHeight, null); + } + catch (Exception ex) + { + _logger.Error("Failed to add transcript bubble", ex); + } + } + + private static string L(string key) => LocalizationHelper.GetString(key); + private static string Lf(string key, params object?[] args) => + string.Format(CultureInfo.CurrentCulture, LocalizationHelper.GetString(key), args); + + private void OnSpeakingChanged(bool isSpeaking) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusText.Text = isSpeaking + ? L("VoiceOverlayWindow_StatusListening") + : L("VoiceOverlayWindow_StatusSpeakNow"); + }); + } + + private void OnAudioLevelChanged(float level) + { + _dispatcherQueue.TryEnqueue(() => + { + // Scale the level bar width (max width = parent width) + var maxWidth = AudioLevelBar.Parent is FrameworkElement parent ? 
parent.ActualWidth : 300; + AudioLevelBar.Width = Math.Max(0, level * maxWidth); + }); + } + + private void OnModeChanged(VoiceMode mode) + { + _dispatcherQueue.TryEnqueue(UpdateUI); + } + + private void OnDiagnosticMessage(string message) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusText.Text = message; + }); + } + + private void OnPipelineStateChanged(AudioPipelineState state) + { + _dispatcherQueue.TryEnqueue(() => + { + StatusBadge.Text = state switch + { + AudioPipelineState.Stopped => L("VoiceOverlayWindow_BadgeStopped"), + AudioPipelineState.Starting => L("VoiceOverlayWindow_BadgeStartingDots"), + AudioPipelineState.Listening => L("VoiceOverlayWindow_BadgeListening"), + AudioPipelineState.Processing => L("VoiceOverlayWindow_BadgeProcessing"), + AudioPipelineState.Error => L("VoiceOverlayWindow_StateError"), + _ => L("VoiceOverlayWindow_BadgeUnknown") + }; + + StatusText.Text = state switch + { + AudioPipelineState.Stopped => L("VoiceOverlayWindow_StatusReadyMessage"), + AudioPipelineState.Starting => L("VoiceOverlayWindow_StatusInitMic"), + AudioPipelineState.Listening => L("VoiceOverlayWindow_StatusSpeakNow"), + AudioPipelineState.Processing => L("VoiceOverlayWindow_StatusTranscribing"), + AudioPipelineState.Error => L("VoiceOverlayWindow_StatusErrorOccurred"), + _ => "" + }; + }); + } + + private void UpdateUI() + { + var isActive = _voiceService.CurrentMode != VoiceMode.Inactive; + + StartStopIcon.Glyph = isActive ? "\uE71A" : "\uE768"; // Stop / Play + StartStopText.Text = isActive + ? 
L("VoiceOverlayWindow_StopText") + : L("VoiceOverlayWindow_ButtonStartListening"); + MuteButton.IsEnabled = isActive; + + if (!isActive) + { + StatusBadge.Text = L("VoiceOverlayWindow_BadgeReady"); + StatusText.Text = L("VoiceOverlayWindow_StatusReadyMessage"); + AudioLevelBar.Width = 0; + } + } + + private async void OnStartStopClick(object sender, RoutedEventArgs e) + { + try + { + if (_voiceService.CurrentMode == VoiceMode.Inactive) + { + StatusText.Text = L("VoiceOverlayWindow_StateInitializing"); + StatusBadge.Text = L("VoiceOverlayWindow_StateStarting"); + StartStopButton.IsEnabled = false; + + // Initialize models if needed (may trigger downloads) + if (!_voiceService.IsModelLoaded) + { + if (!_voiceService.IsModelDownloaded) + { + StatusText.Text = L("VoiceOverlayWindow_StateDownloadingModel"); + var progress = new Progress<(long downloaded, long total)>(p => + { + _dispatcherQueue.TryEnqueue(() => + { + var pct = p.total > 0 ? (int)(p.downloaded * 100 / p.total) : 0; + StatusText.Text = Lf("VoiceOverlayWindow_StateDownloadingPct", pct); + }); + }); + await _voiceService.DownloadModelAsync(progress: progress); + } + + StatusText.Text = L("VoiceOverlayWindow_StateLoadingModel"); + await _voiceService.InitializeAsync(); + } + + StatusText.Text = L("VoiceOverlayWindow_StateStartingMic"); + await _voiceService.StartVoiceChatAsync(); + } + else + { + StatusText.Text = L("VoiceOverlayWindow_StateStopping"); + await _voiceService.StopAsync(); + } + } + catch (Exception ex) + { + _logger.Error("Voice overlay start/stop failed", ex); + // Sanitized — full ex.Message is in the log. + StatusText.Text = L("VoiceOverlayWindow_StatusError"); + StatusBadge.Text = L("VoiceOverlayWindow_StateError"); + } + finally + { + StartStopButton.IsEnabled = true; + UpdateUI(); + } + } + + private async void OnMuteClick(object sender, RoutedEventArgs e) + { + _isMuted = !_isMuted; + MuteIcon.Glyph = _isMuted ? 
"\uE74F" : "\uE767"; // Muted / Volume + + if (_isMuted) + { + await _voiceService.StopAsync(); + StatusText.Text = L("VoiceOverlayWindow_StatusMuted"); + } + else + { + await _voiceService.StartVoiceChatAsync(); + } + } + + private void OnSettingsClick(object sender, RoutedEventArgs e) + { + SettingsRequested?.Invoke(); + } + + private void WindowClosed(object sender, WindowEventArgs args) + { + _voiceService.TranscriptionReceived -= OnTranscriptionReceived; + _voiceService.UtteranceCompleted -= OnUtteranceCompleted; + _voiceService.SpeakingChanged -= OnSpeakingChanged; + _voiceService.AudioLevelChanged -= OnAudioLevelChanged; + _voiceService.ModeChanged -= OnModeChanged; + _voiceService.PipelineStateChanged -= OnPipelineStateChanged; + _voiceService.DiagnosticMessage -= OnDiagnosticMessage; + + // Stop voice session when window closes + _ = _voiceService.StopAsync(); + } +} diff --git a/src/OpenClaw.WinNode.Cli/skill.md b/src/OpenClaw.WinNode.Cli/skill.md index acb5add9..d7373fa9 100644 --- a/src/OpenClaw.WinNode.Cli/skill.md +++ b/src/OpenClaw.WinNode.Cli/skill.md @@ -221,6 +221,124 @@ default camera. ``` Returns `{ format, durationMs, base64 }`. +## Speech-to-text (stt.*) + +Local Whisper.net runs on this device — no audio leaves the box. The +model is downloaded on first use; until then every `stt.*` call returns +a clear error pointing the caller at the Voice Settings page. +**Privacy-sensitive: requires `NodeSttEnabled` in tray Settings.** + +### stt.transcribe +Bounded fixed-duration mic capture + transcription. +``` +{ + "maxDurationMs": 5000, // required, > 0, max 30000 + "language": "en" // optional BCP-47 tag or "auto" — falls back to SttLanguage setting +} +``` +Returns `{ transcribed, text, durationMs, language, engineEffective: "whisper" }`. + +### stt.listen +Mic capture with voice-activity detection. Returns when the user stops +speaking or after `timeoutMs`. 
Result is the full silence-bounded +utterance (all Whisper segments concatenated), not a partial first +segment. +``` +{ + "timeoutMs": 30000, // optional, default 30000, range 1000..120000 + "language": "auto" // optional BCP-47 tag or "auto" +} +``` +Returns `{ text, language, durationMs, segments[{ text, startMs, endMs }], engineEffective: "whisper" }`. + +### stt.status +Engine readiness. No params. Carries no PII (no transcript history, +no language history, no device IDs, no model paths). +Returns `{ engine: "whisper", readiness, modelDownloadProgress, isListenWithVadSupported, isBoundedTranscribeSupported }` +where `readiness` ∈ `"ready" | "initializing" | "model-downloading" | "model-not-downloaded" | "unavailable"`. + +## Text-to-speech (tts.*) + +Three providers — Piper (local neural via Sherpa-ONNX, default), Windows +built-in speech, and ElevenLabs (cloud). Provider + per-provider voice +are configured in tray Settings. + +### tts.speak +Speak text aloud on the Windows node. +``` +{ + "text": "string", // required + "provider": "piper|windows|elevenlabs", // optional, falls back to TtsProvider setting + "voiceId": "string", // optional, overrides the per-provider configured voice + "model": "string", // optional, ElevenLabs only + "interrupt": false // default false; true cuts off any in-progress playback +} +``` +Returns `{ spoken, provider, contentType, durationMs }`. + +## App control (app.*) + +Read-only and small write operations targeting the running tray. Used +by the command palette and by automation that wants to drive the UI. + +### app.navigate +Navigate the companion app to a specific page. +``` +{"page": "home|sessions|settings|chat|voice|connection|capabilities|conversations|...""} +``` +Returns `{ navigated, page }`. + +### app.status +Current connection / node state. +No params. Returns `{ connectionStatus, nodeConnected, nodePaired, nodePendingApproval, gatewayVersion, sessionCount, nodeCount }`. 
+ +### app.sessions +Active sessions, optionally filtered by agent. +``` +{"agentId": "string"} // optional +``` +Returns array of `{ Key, Status, Model, AgeText, tokens }`. + +### app.agents +List agents from the connected gateway. No params. Returns the raw +agents JSON array. + +### app.nodes +List connected nodes and their capabilities. No params. Returns array +of `{ DisplayName, NodeId, IsOnline, Platform, CapabilityCount }`. + +### app.config.get +Read gateway configuration value at a dot-path. +``` +{"path": "string"} // optional; omit to fetch the full config tree +``` +Returns the config subtree (or full config) as JSON. + +### app.settings.get +Read a local app setting by name. +``` +{"name": "string"} // required +``` +Returns the setting value (type depends on the setting). + +### app.settings.set +Set a local app setting. +``` +{"name": "string", "value": "string"} // both required +``` +Returns `{ name, value }`. + +### app.menu +Get tray menu state (status, session count, node count). No params. +Returns array of menu items. + +### app.search +Search the command palette and return matching commands. +``` +{"query": "string"} // required +``` +Returns array of `{ Title, Subtitle, Icon }`. + --- ## A2UI v0.8 grammar (for canvas.a2ui.push) diff --git a/tests/OpenClaw.Shared.Tests/AssetHashPinningTests.cs b/tests/OpenClaw.Shared.Tests/AssetHashPinningTests.cs new file mode 100644 index 00000000..5073353d --- /dev/null +++ b/tests/OpenClaw.Shared.Tests/AssetHashPinningTests.cs @@ -0,0 +1,70 @@ +using System.Text.RegularExpressions; +using OpenClaw.Shared.Audio; +using Xunit; + +namespace OpenClaw.Shared.Tests; + +/// +/// Pre-GA security guard. Every shipped Whisper model and Piper voice MUST +/// have a pinned SHA-256 hash so the runtime can refuse tampered downloads. +/// New entries that forget the hash will fail this test loudly instead of +/// quietly being installable from a compromised source. 
+/// +/// See WhisperModelManager.AvailableModels / PiperVoiceManager.AvailableVoices +/// and Audio_FollowUps.md §2. +/// +public class AssetHashPinningTests +{ + private static readonly Regex Sha256Hex = new("^[0-9a-f]{64}$", RegexOptions.Compiled); + + [Fact] + public void EveryWhisperModel_HasPinnedSha256() + { + Assert.NotEmpty(WhisperModelManager.AvailableModels); + foreach (var m in WhisperModelManager.AvailableModels) + { + Assert.False(string.IsNullOrWhiteSpace(m.Sha256), + $"Whisper model '{m.Name}' is missing a pinned SHA-256 hash. Add one to AvailableModels."); + Assert.Matches(Sha256Hex, m.Sha256!); + } + } + + [Fact] + public void EveryPiperVoice_HasPinnedSha256() + { + Assert.NotEmpty(PiperVoiceManager.AvailableVoices); + foreach (var v in PiperVoiceManager.AvailableVoices) + { + Assert.False(string.IsNullOrWhiteSpace(v.Sha256), + $"Piper voice '{v.VoiceId}' is missing a pinned SHA-256 hash. Add one to AvailableVoices."); + Assert.Matches(Sha256Hex, v.Sha256!); + } + } + + [Fact] + public void EveryWhisperModel_UsesHttpsDownloadUrl() + { + foreach (var m in WhisperModelManager.AvailableModels) + { + Assert.StartsWith("https://", m.DownloadUrl); + } + } + + [Fact] + public void EveryPiperVoice_UsesHttpsDownloadUrl() + { + foreach (var v in PiperVoiceManager.AvailableVoices) + { + Assert.StartsWith("https://", v.DownloadUrl); + } + } + + [Fact] + public void SileroVadModel_HasPinnedSha256() + { + Assert.False(string.IsNullOrWhiteSpace(SileroVadModelManifest.Sha256), + "Silero VAD model is missing a pinned SHA-256 hash. 
Add one to SileroVadModelManifest."); + Assert.Matches(Sha256Hex, SileroVadModelManifest.Sha256); + Assert.StartsWith("https://", SileroVadModelManifest.DownloadUrl); + } +} diff --git a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs index 0cda6237..f79c25cd 100644 --- a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs +++ b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs @@ -2583,8 +2583,8 @@ public void CanHandle_TtsSpeak() [InlineData(" ELEVENLABS ", "windows", "elevenlabs")] [InlineData(null, "elevenlabs", "elevenlabs")] [InlineData(" ", "elevenlabs", "elevenlabs")] - [InlineData(null, "", "windows")] - [InlineData(null, " ", "windows")] + [InlineData(null, "", "piper")] + [InlineData(null, " ", "piper")] public void ResolveProvider_NormalizesRequestedAndConfiguredValues( string? requestedProvider, string? configuredProvider, @@ -2712,7 +2712,32 @@ public async Task Speak_ReturnsError_WhenHandlerThrows() }); Assert.False(res.Ok); - Assert.Contains("Audio device unavailable", res.Error); + // Privacy: response surfaces a fixed sanitized error; the underlying + // exception text (which can include device names, ElevenLabs key + // fragments from 401 messages, etc.) stays in the local log only. + Assert.Equal("Speak failed", res.Error); + } + + [Fact] + public async Task Speak_HandlerException_DoesNotLeakExceptionMessageIntoError() + { + // Privacy regression: a 401 from ElevenLabs containing a key prefix + // must not bleed into the response error path (and from there into + // recent activity / support bundles). 
+ var cap = new TtsCapability(NullLogger.Instance); + const string sensitive = "ElevenLabs 401: invalid key sk-secret-prefix-do-not-leak"; + cap.SpeakRequested += (_, _) => throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "tts-priv", + Command = "tts.speak", + Args = Parse("""{"text":"hello"}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + Assert.DoesNotContain("sk-secret-prefix-do-not-leak", res.Error); } [Fact] @@ -2901,3 +2926,567 @@ public async Task ExecuteAsync_ReturnsError_ForUnknownCommand() Assert.Contains("Unknown command", res.Error); } } + +public class SttCapabilityTests +{ + private static JsonElement Parse(string json) + { + using var doc = JsonDocument.Parse(json); + return doc.RootElement.Clone(); + } + + [Fact] + public void CanHandle_SttTranscribe() + { + var cap = new SttCapability(NullLogger.Instance); + Assert.True(cap.CanHandle("stt.transcribe")); + Assert.True(cap.CanHandle("stt.listen")); + Assert.True(cap.CanHandle("stt.status")); + Assert.False(cap.CanHandle("stt.stream")); + Assert.False(cap.CanHandle("tts.speak")); + Assert.Equal("stt", cap.Category); + Assert.Contains(SttCapability.TranscribeCommand, cap.Commands); + Assert.Contains(SttCapability.ListenCommand, cap.Commands); + Assert.Contains(SttCapability.StatusCommand, cap.Commands); + } + + [Fact] + public void ResolveLanguage_PrefersRequested() + { + Assert.Equal("ja-JP", SttCapability.ResolveLanguage("ja-JP", "en-GB")); + Assert.Equal("en-GB", SttCapability.ResolveLanguage(null, "en-GB")); + Assert.Equal("en-GB", SttCapability.ResolveLanguage(" ", "en-GB")); + Assert.Equal(SttCapability.DefaultLanguage, SttCapability.ResolveLanguage(null, null)); + } + + [Fact] + public void ResolveLanguage_RejectsNonsense() + { + Assert.Null(SttCapability.ResolveLanguage("not a tag", null)); + Assert.Null(SttCapability.ResolveLanguage("english", null)); + 
Assert.Null(SttCapability.ResolveLanguage("en_US", null)); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenMaxDurationMissing() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += (_, _) => throw new InvalidOperationException("should not be called"); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt1", + Command = "stt.transcribe", + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Missing required maxDurationMs", res.Error); + } + + [Theory] + [InlineData(0)] + [InlineData(-1)] + [InlineData(-5000)] + public async Task Transcribe_ReturnsError_WhenMaxDurationNotPositive(int maxMs) + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt2", + Command = "stt.transcribe", + Args = Parse($$"""{"maxDurationMs":{{maxMs}}}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Missing required maxDurationMs", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenMaxDurationExceedsBound() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt3", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":60000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("exceeds 30000", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenLanguageInvalid() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt4", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000,"language":"english please"}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Invalid language tag", res.Error); + } + + [Fact] + public async Task Transcribe_InvalidLanguageError_DoesNotEchoCallerInput() + { + // Privacy regression: caller-supplied language must not be echoed back + // in the error string, since failed-invoke errors 
land in recent + // activity / support bundles. + var cap = new SttCapability(NullLogger.Instance); + const string secretish = "ZZ-secret-tag-do-not-leak"; + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt-priv-lang", + Command = "stt.transcribe", + Args = Parse($$"""{"maxDurationMs":5000,"language":"{{secretish}}"}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(secretish, res.Error); + } + + [Fact] + public async Task Transcribe_HandlerException_DoesNotLeakExceptionMessageIntoError() + { + // Privacy regression: raw handler exception text could surface mic / + // audio-stack details. Response error must be a fixed sanitized + // string; full detail stays in logs. + var cap = new SttCapability(NullLogger.Instance); + const string sensitive = "secret-mic-device-path-or-stack-trace"; + cap.TranscribeRequested += (_, _) => throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt-priv-ex", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsError_WhenHandlerNotWired() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt5", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error); + } + + [Fact] + public async Task Transcribe_PassesArgsToHandler_AndReturnsPayload() + { + var cap = new SttCapability(NullLogger.Instance); + SttTranscribeArgs? received = null; + cap.TranscribeRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttTranscribeResult + { + Transcribed = true, + Text = "hello", + DurationMs = 4200, + Language = a.Language ?? 
SttCapability.DefaultLanguage + }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt6", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":5000,"language":"en-GB"}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal(5000, received!.MaxDurationMs); + Assert.Equal("en-GB", received.Language); + + var payload = JsonSerializer.SerializeToElement(res.Payload); + Assert.True(payload.GetProperty("transcribed").GetBoolean()); + Assert.Equal("hello", payload.GetProperty("text").GetString()); + Assert.Equal(4200, payload.GetProperty("durationMs").GetInt32()); + Assert.Equal("en-GB", payload.GetProperty("language").GetString()); + } + + [Fact] + public async Task Transcribe_DropsLanguage_WhenOmitted_LettingTrayUseSetting() + { + var cap = new SttCapability(NullLogger.Instance); + SttTranscribeArgs? received = null; + cap.TranscribeRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttTranscribeResult { Transcribed = true, Text = "hi", DurationMs = 100, Language = "en-US" }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt7", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":1000}""") + }); + + Assert.True(res.Ok); + Assert.Null(received!.Language); + } + + [Fact] + public async Task Transcribe_ReportsHandlerException() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += (_, _) => throw new InvalidOperationException("Microphone unavailable."); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt8", + Command = "stt.transcribe", + Args = Parse("""{"maxDurationMs":2000}""") + }); + + Assert.False(res.Ok); + // Privacy: response surfaces a fixed sanitized error; raw exception + // text stays in the local log only. See + // Transcribe_HandlerException_DoesNotLeakExceptionMessageIntoError. 
+ Assert.Equal("Transcribe failed", res.Error); + } + + [Fact] + public async Task Transcribe_ReturnsCanceled_WhenTokenFires() + { + var cap = new SttCapability(NullLogger.Instance); + cap.TranscribeRequested += async (_, ct) => + { + await Task.Delay(Timeout.Infinite, ct); + return new SttTranscribeResult(); + }; + + using var cts = new CancellationTokenSource(); + cts.CancelAfter(TimeSpan.FromMilliseconds(50)); + + var res = await cap.ExecuteAsync( + new NodeInvokeRequest { Id = "stt9", Command = "stt.transcribe", Args = Parse("""{"maxDurationMs":5000}""") }, + cts.Token); + + Assert.False(res.Ok); + Assert.Contains("canceled", res.Error, StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task ExecuteAsync_ReturnsError_ForUnknownCommand() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "stt10", + Command = "stt.stream", + Args = Parse("""{}""") + }); + Assert.False(res.Ok); + Assert.Contains("Unknown command", res.Error); + } + + // ============================================================ + // stt.listen (VAD-driven capture) + // ============================================================ + + [Fact] + public async Task Listen_ClampsTimeoutMs_BelowMin() + { + var cap = new SttCapability(NullLogger.Instance); + SttListenArgs? received = null; + cap.ListenRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttListenResult { Text = "x", Language = "auto", DurationMs = 100 }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-min", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":50}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal(SttCapability.MinListenTimeoutMs, received!.TimeoutMs); + } + + [Fact] + public async Task Listen_ClampsTimeoutMs_AboveMax() + { + var cap = new SttCapability(NullLogger.Instance); + SttListenArgs? 
received = null; + cap.ListenRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttListenResult { Text = "x", Language = "auto", DurationMs = 100 }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-max", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":1000000}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal(SttCapability.MaxListenTimeoutMs, received!.TimeoutMs); + } + + [Fact] + public async Task Listen_DefaultsLanguageToAuto() + { + var cap = new SttCapability(NullLogger.Instance); + SttListenArgs? received = null; + cap.ListenRequested += (a, _) => + { + received = a; + return Task.FromResult(new SttListenResult { Text = "ok", Language = a.Language, DurationMs = 100 }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-auto", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000}""") + }); + + Assert.True(res.Ok); + Assert.Equal(SttCapability.AutoLanguage, received!.Language); + } + + [Fact] + public async Task Listen_ReturnsError_WhenLanguageInvalid() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-bad-lang", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000,"language":"english please"}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Invalid language tag", res.Error); + } + + [Fact] + public async Task Listen_InvalidLanguageError_DoesNotEchoCallerInput() + { + var cap = new SttCapability(NullLogger.Instance); + const string secretish = "ZZ-secret-tag-do-not-leak"; + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-priv-lang", + Command = "stt.listen", + Args = Parse($$"""{"timeoutMs":5000,"language":"{{secretish}}"}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(secretish, res.Error); + } + + [Fact] + public async Task Listen_ReturnsError_WhenHandlerNotWired() + { + var cap = new 
SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-no-handler", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error); + } + + [Fact] + public async Task Listen_HandlerException_DoesNotLeakExceptionMessageIntoError() + { + var cap = new SttCapability(NullLogger.Instance); + const string sensitive = "secret-mic-device-path-or-stack-trace"; + cap.ListenRequested += (_, _) => throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-priv-ex", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + Assert.Equal("Listen failed", res.Error); + } + + [Fact] + public async Task Listen_PassesSegmentsAndEngineMetadata() + { + var cap = new SttCapability(NullLogger.Instance); + cap.ListenRequested += (_, _) => Task.FromResult(new SttListenResult + { + Text = "hello world", + Language = "en-US", + DurationMs = 1500, + Segments = new[] + { + new SttSegment { Text = "hello", StartMs = 0, EndMs = 500 }, + new SttSegment { Text = "world", StartMs = 600, EndMs = 1500 }, + }, + EngineEffective = SttCapability.EngineWhisper + }); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "listen-payload", + Command = "stt.listen", + Args = Parse("""{"timeoutMs":5000,"language":"en-US"}""") + }); + + Assert.True(res.Ok); + // Round-trip through serialization to make sure the response object + // exposes the new fields. 
+ var json = System.Text.Json.JsonSerializer.Serialize(res.Payload); + Assert.Contains("\"text\":\"hello world\"", json); + Assert.Contains("\"engineEffective\":\"whisper\"", json); + Assert.Contains("\"segments\":", json); + } + + [Fact] + public async Task Listen_ReturnsCanceled_WhenTokenFires() + { + var cap = new SttCapability(NullLogger.Instance); + cap.ListenRequested += async (_, ct) => + { + await Task.Delay(Timeout.Infinite, ct); + return new SttListenResult(); + }; + + using var cts = new CancellationTokenSource(); + cts.CancelAfter(TimeSpan.FromMilliseconds(50)); + + var res = await cap.ExecuteAsync( + new NodeInvokeRequest { Id = "listen-cancel", Command = "stt.listen", Args = Parse("""{"timeoutMs":5000}""") }, + cts.Token); + + Assert.False(res.Ok); + Assert.Contains("canceled", res.Error, StringComparison.OrdinalIgnoreCase); + } + + // ============================================================ + // stt.status + // ============================================================ + + [Fact] + public async Task Status_ReturnsError_WhenHandlerNotWired() + { + var cap = new SttCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "status-no-handler", + Command = "stt.status", + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error); + } + + [Fact] + public async Task Status_HandlerException_DoesNotLeakExceptionMessageIntoError() + { + var cap = new SttCapability(NullLogger.Instance); + const string sensitive = "secret-engine-stack-trace"; + cap.StatusRequested += _ => throw new InvalidOperationException(sensitive); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "status-priv-ex", + Command = "stt.status", + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.DoesNotContain(sensitive, res.Error); + Assert.Equal("Status failed", res.Error); + } + + [Fact] + public async Task Status_ReturnsEngineReadiness() + { + var cap = new 
SttCapability(NullLogger.Instance); + cap.StatusRequested += _ => Task.FromResult(new SttStatusResult + { + Engine = SttCapability.EngineWhisper, + Readiness = "model-downloading", + ModelDownloadProgress = 0.42, + IsListenWithVadSupported = false, + IsBoundedTranscribeSupported = false, + }); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "status-ok", + Command = "stt.status", + Args = Parse("""{}""") + }); + + Assert.True(res.Ok); + var json = System.Text.Json.JsonSerializer.Serialize(res.Payload); + Assert.Contains("\"engine\":\"whisper\"", json); + Assert.Contains("\"readiness\":\"model-downloading\"", json); + Assert.Contains("\"modelDownloadProgress\":0.42", json); + // No PII fields ever surface in stt.status — even when synthesizing + // a result, callers can only see flat readiness strings + a single + // engine identifier. + Assert.DoesNotContain("language", json, StringComparison.OrdinalIgnoreCase); + Assert.DoesNotContain("path", json, StringComparison.OrdinalIgnoreCase); + } + + // ============================================================ + // BCP-47 + "auto" sentinel + // ============================================================ + + [Theory] + [InlineData("en-US", "en-US")] + [InlineData("en-GB", "en-GB")] + [InlineData("ja-JP", "ja-JP")] + [InlineData("zh-Hans-CN", "zh-Hans-CN")] + [InlineData(" en-US ", "en-US")] // leading/trailing whitespace trimmed + [InlineData("auto", "auto")] + [InlineData("AUTO", "auto")] // case-insensitive sentinel, normalized to lowercase + [InlineData("Auto", "auto")] + public void NormalizeLanguageTag_AcceptsValid(string input, string expected) + { + Assert.Equal(expected, SttCapability.NormalizeLanguageTag(input)); + } + + [Theory] + [InlineData("english")] + [InlineData("en_US")] // underscore not allowed + [InlineData("not a tag")] + [InlineData("en US")] // space not allowed + [InlineData("automatic")] // not the sentinel + public void NormalizeLanguageTag_RejectsInvalid(string input) + { 
+ Assert.Null(SttCapability.NormalizeLanguageTag(input)); + } +} diff --git a/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs b/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs index 043b8212..c25614c4 100644 --- a/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs +++ b/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs @@ -352,6 +352,118 @@ public async Task UnhandledException_ReturnsGenericInternalError_NotLeakingMessa Assert.DoesNotContain("secret-internal-detail", error.GetProperty("message").GetString()); } + [Fact] + public async Task ToolsList_SttTranscribe_HasCuratedDescription() + { + var caps = new List + { + new FakeCapability("stt", "stt.transcribe"), + }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var description = doc.RootElement.GetProperty("result") + .GetProperty("tools")[0] + .GetProperty("description") + .GetString()!; + + // Must mention the key surface area so MCP clients render something useful. + Assert.Contains("microphone", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("maxDurationMs", description); + Assert.Contains("text", description, System.StringComparison.OrdinalIgnoreCase); + // And explicitly NOT the generic stub. 
+ Assert.DoesNotContain("stt capability:", description); + } + + [Fact] + public async Task ToolsList_SttListen_HasCuratedDescription() + { + var caps = new List { new FakeCapability("stt", "stt.listen") }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var description = doc.RootElement.GetProperty("result") + .GetProperty("tools")[0] + .GetProperty("description") + .GetString()!; + + Assert.Contains("voice-activity detection", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("timeoutMs", description); + // Privacy: must mention NodeSttEnabled gate so MCP clients + // know this is opt-in. + Assert.Contains("NodeSttEnabled", description); + // Engine surface must be advertised so callers can read engineEffective. + Assert.Contains("engineEffective", description); + Assert.DoesNotContain("stt capability:", description); + } + + [Fact] + public async Task ToolsList_SttStatus_HasCuratedDescription() + { + var caps = new List { new FakeCapability("stt", "stt.status") }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var description = doc.RootElement.GetProperty("result") + .GetProperty("tools")[0] + .GetProperty("description") + .GetString()!; + + Assert.Contains("readiness", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("engine", description, System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("whisper", description, System.StringComparison.OrdinalIgnoreCase); + // Privacy invariant in the description itself: no PII. 
+ Assert.Contains("no PII", description, System.StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task ToolsList_AllStt_AppearWhenSttCapabilityRegistered() + { + // Single SttCapability instance advertises all three commands. + var caps = new List + { + new FakeCapability("stt", "stt.transcribe", "stt.listen", "stt.status"), + }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var toolNames = new HashSet(); + foreach (var t in doc.RootElement.GetProperty("result").GetProperty("tools").EnumerateArray()) + toolNames.Add(t.GetProperty("name").GetString()!); + + Assert.Contains("stt.transcribe", toolNames); + Assert.Contains("stt.listen", toolNames); + Assert.Contains("stt.status", toolNames); + } + + [Fact] + public async Task ToolsList_AllStt_Absent_WhenSttCapabilityNotRegistered() + { + // STT capability is gated by NodeSttEnabled in NodeService; + // when disabled, no SttCapability is constructed and tools/list + // must omit the three stt.* tools. 
+ var caps = new List + { + new FakeCapability("device", "device.status"), + new FakeCapability("tts", "tts.speak"), + }; + var bridge = CreateBridge(caps); + var resp = await bridge.HandleRequestAsync(@"{""jsonrpc"":""2.0"",""id"":1,""method"":""tools/list""}"); + + using var doc = JsonDocument.Parse(resp!); + var toolNames = new HashSet(); + foreach (var t in doc.RootElement.GetProperty("result").GetProperty("tools").EnumerateArray()) + toolNames.Add(t.GetProperty("name").GetString()!); + + Assert.DoesNotContain("stt.transcribe", toolNames); + Assert.DoesNotContain("stt.listen", toolNames); + Assert.DoesNotContain("stt.status", toolNames); + } + [Fact] public async Task Initialize_ReturnsCustomServerNameAndVersion() { diff --git a/tests/OpenClaw.Shared.Tests/ModelsTests.cs b/tests/OpenClaw.Shared.Tests/ModelsTests.cs index ac9ac83f..ad93fb34 100644 --- a/tests/OpenClaw.Shared.Tests/ModelsTests.cs +++ b/tests/OpenClaw.Shared.Tests/ModelsTests.cs @@ -1732,4 +1732,34 @@ public void ContextSummaryShort_FormatsSmallNumbers() var session = new SessionInfo { TotalTokens = 500, ContextTokens = 1000 }; Assert.Contains("500/1.0K", session.ContextSummaryShort); } + + [Fact] + public void DangerousCommands_IncludesSttTranscribe() + { + Assert.Contains("stt.transcribe", CommandCenterCommandGroups.DangerousCommands); + Assert.Contains("stt.transcribe", (IReadOnlySet)CommandCenterCommandGroups.DangerousCommandSet); + // stt.listen and stt.status need the same explicit gateway opt-in so + // chat agents see them once NodeSttEnabled is on. Otherwise the + // gateway's Windows platform default policy keeps them hidden. + Assert.Contains("stt.listen", CommandCenterCommandGroups.DangerousCommands); + Assert.Contains("stt.status", CommandCenterCommandGroups.DangerousCommands); + } + + [Fact] + public void MacNodeParityCommands_ExcludesSttTranscribe() + { + // Mac has no equivalent yet; ensure parity diagnostic does not flag + // Windows nodes for "missing" stt.transcribe. 
+ Assert.DoesNotContain("stt.transcribe", CommandCenterCommandGroups.MacNodeParityCommands); + } + + [Fact] + public void CommonDangerousCommands_StillIncludedInMacParity() + { + // Refactor invariant: the original camera/screen dangerous commands + // still appear in Mac parity via the shared CommonDangerousCommands set. + Assert.Contains("camera.snap", CommandCenterCommandGroups.MacNodeParityCommands); + Assert.Contains("camera.clip", CommandCenterCommandGroups.MacNodeParityCommands); + Assert.Contains("screen.record", CommandCenterCommandGroups.MacNodeParityCommands); + } } diff --git a/tests/OpenClaw.Shared.Tests/SingleFlightDownloadTests.cs b/tests/OpenClaw.Shared.Tests/SingleFlightDownloadTests.cs new file mode 100644 index 00000000..3b2e3057 --- /dev/null +++ b/tests/OpenClaw.Shared.Tests/SingleFlightDownloadTests.cs @@ -0,0 +1,134 @@ +using System; +using System.Collections.Concurrent; +using System.Threading; +using System.Threading.Tasks; +using OpenClaw.Shared.Audio; +using Xunit; + +namespace OpenClaw.Shared.Tests; + +public sealed class SingleFlightDownloadTests +{ + [Fact] + public async Task ConcurrentCallers_StartOnlyOneSharedOperation() + { + var inFlight = new ConcurrentDictionary>(StringComparer.OrdinalIgnoreCase); + var release = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var started = 0; + + Task Start(CancellationToken _) + { + Interlocked.Increment(ref started); + return release.Task; + } + + var callers = new Task[50]; + for (var i = 0; i < callers.Length; i++) + { + callers[i] = SingleFlightDownload.RunAsync(inFlight, "asset", Start); + } + + await WaitUntilAsync(() => Volatile.Read(ref started) == 1); + release.SetResult(); + await Task.WhenAll(callers); + + Assert.Equal(1, Volatile.Read(ref started)); + await WaitUntilAsync(() => inFlight.IsEmpty); + } + + [Fact] + public async Task CancelingOneWaiter_DoesNotCancelSharedOperation() + { + var inFlight = new 
ConcurrentDictionary<string, Task>(StringComparer.OrdinalIgnoreCase);
+        var release = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
+        var started = 0;
+        CancellationToken sharedToken = default;
+
+        Task Start(CancellationToken token)
+        {
+            sharedToken = token;
+            Interlocked.Increment(ref started);
+            return release.Task;
+        }
+
+        using var callerCts = new CancellationTokenSource();
+        var canceledWaiter = SingleFlightDownload.RunAsync(inFlight, "asset", Start, callerCts.Token);
+        await WaitUntilAsync(() => Volatile.Read(ref started) == 1);
+
+        var continuingWaiter = SingleFlightDownload.RunAsync(inFlight, "asset", Start);
+        callerCts.Cancel();
+
+        await Assert.ThrowsAsync<TaskCanceledException>(() => canceledWaiter);
+        Assert.False(sharedToken.CanBeCanceled);
+
+        release.SetResult();
+        await continuingWaiter;
+
+        Assert.Equal(1, Volatile.Read(ref started));
+        await WaitUntilAsync(() => inFlight.IsEmpty);
+    }
+
+    [Fact]
+    public async Task FailedSharedOperation_IsRemovedSoRetryCanStart()
+    {
+        var inFlight = new ConcurrentDictionary<string, Task>(StringComparer.OrdinalIgnoreCase);
+        var attempts = 0;
+
+        Task Start(CancellationToken _)
+        {
+            return Interlocked.Increment(ref attempts) == 1
+                ?
Task.FromException(new InvalidOperationException("first failure"))
+                : Task.CompletedTask;
+        }
+
+        var ex = await Assert.ThrowsAsync<InvalidOperationException>(
+            () => SingleFlightDownload.RunAsync(inFlight, "asset", Start));
+        Assert.Equal("first failure", ex.Message);
+
+        await WaitUntilAsync(() => inFlight.IsEmpty);
+        await SingleFlightDownload.RunAsync(inFlight, "asset", Start);
+
+        Assert.Equal(2, Volatile.Read(ref attempts));
+    }
+
+    [Fact]
+    public async Task SynchronousFactoryFailure_IsRemovedSoRetryCanStart()
+    {
+        var inFlight = new ConcurrentDictionary<string, Task>(StringComparer.OrdinalIgnoreCase);
+        var attempts = 0;
+
+        Task Start(CancellationToken _)
+        {
+            if (Interlocked.Increment(ref attempts) == 1)
+            {
+                throw new InvalidOperationException("sync failure");
+            }
+
+            return Task.CompletedTask;
+        }
+
+        var ex = await Assert.ThrowsAsync<InvalidOperationException>(
+            () => SingleFlightDownload.RunAsync(inFlight, "asset", Start));
+        Assert.Equal("sync failure", ex.Message);
+
+        await WaitUntilAsync(() => inFlight.IsEmpty);
+        await SingleFlightDownload.RunAsync(inFlight, "asset", Start);
+
+        Assert.Equal(2, Volatile.Read(ref attempts));
+    }
+
+    private static async Task WaitUntilAsync(Func<bool> condition)
+    {
+        for (var i = 0; i < 100; i++)
+        {
+            if (condition())
+            {
+                return;
+            }
+
+            await Task.Delay(10);
+        }
+
+        Assert.True(condition());
+    }
+}
diff --git a/tests/OpenClaw.Shared.Tests/SpeechToTextLanguageNormalizationTests.cs b/tests/OpenClaw.Shared.Tests/SpeechToTextLanguageNormalizationTests.cs
new file mode 100644
index 00000000..971495a9
--- /dev/null
+++ b/tests/OpenClaw.Shared.Tests/SpeechToTextLanguageNormalizationTests.cs
@@ -0,0 +1,43 @@
+using OpenClaw.Shared.Audio;
+using Xunit;
+
+namespace OpenClaw.Shared.Tests;
+
+///
+/// SttCapability accepts BCP-47 language tags (the validator + MCP docs
+/// both advertise the wider shape like "en-US"), but Whisper.net's
+/// WithLanguage call only understands "auto" or 2-letter ISO 639-1 codes.
+/// SpeechToTextService.NormalizeForWhisper bridges the gap.
These tests +/// pin the normalization rules so a future change can't silently start +/// passing a region-tagged BCP-47 string straight to Whisper. +/// +public class SpeechToTextLanguageNormalizationTests +{ + [Theory] + [InlineData("auto", "auto")] + [InlineData("AUTO", "auto")] + [InlineData("en", "en")] + [InlineData("EN", "en")] + [InlineData("en-US", "en")] + [InlineData("en-us", "en")] + [InlineData("zh-Hans-CN", "zh")] + [InlineData("fr-FR", "fr")] + [InlineData(" ja-JP ", "ja")] + public void NormalizeForWhisper_StripsRegionAndScript(string input, string expected) + { + Assert.Equal(expected, SpeechToTextService.NormalizeForWhisper(input)); + } + + [Theory] + [InlineData(null)] + [InlineData("")] + [InlineData(" ")] + [InlineData("abc")] // 3-letter — no safe ISO 639-3 cross-walk + [InlineData("e")] // single letter + [InlineData("123-XX")] // numeric primary subtag + [InlineData("en1-US")] // non-letter primary + public void NormalizeForWhisper_FallsBackToAuto_OnInvalid(string? 
input) + { + Assert.Equal("auto", SpeechToTextService.NormalizeForWhisper(input)); + } +} diff --git a/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs b/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs index b14b9c2a..128571bf 100644 --- a/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs +++ b/tests/OpenClaw.Tray.Tests/ActivityStreamServiceTests.cs @@ -2,6 +2,13 @@ namespace OpenClaw.Tray.Tests; +[CollectionDefinition(ActivityStreamServiceCollection.Name, DisableParallelization = true)] +public sealed class ActivityStreamServiceCollection +{ + public const string Name = "ActivityStreamService"; +} + +[Collection(ActivityStreamServiceCollection.Name)] public class ActivityStreamServiceTests : IDisposable { public ActivityStreamServiceTests() diff --git a/tests/OpenClaw.Tray.Tests/CapabilitiesPageLocalizationCoverageTests.cs b/tests/OpenClaw.Tray.Tests/CapabilitiesPageLocalizationCoverageTests.cs new file mode 100644 index 00000000..4ebee380 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/CapabilitiesPageLocalizationCoverageTests.cs @@ -0,0 +1,107 @@ +using System.Xml.Linq; + +namespace OpenClaw.Tray.Tests; + +/// +/// Pins that the STT/TTS card controls in CapabilitiesPage.xaml are localized +/// (have an x:Uid) and that en-us\Resources.resw provides matching keys. +/// +/// LocalizationValidationTests catches drift between locales, but does not +/// catch the case where a developer adds a control with hardcoded English +/// text and never registers it. This test closes that hole for the new +/// privacy-sensitive voice surface (the engine picker, the language input, +/// the ElevenLabs panel, and the deep-link to VoiceSettingsPage). 
+///
+public sealed class CapabilitiesPageLocalizationCoverageTests
+{
+    private static readonly XNamespace XNs = "http://schemas.microsoft.com/winfx/2006/xaml";
+
+    private static string GetRepositoryRoot()
+    {
+        var envRepoRoot = Environment.GetEnvironmentVariable("OPENCLAW_REPO_ROOT");
+        if (!string.IsNullOrWhiteSpace(envRepoRoot) && Directory.Exists(envRepoRoot))
+            return envRepoRoot;
+
+        var directory = new DirectoryInfo(AppContext.BaseDirectory);
+        while (directory != null)
+        {
+            if ((Directory.Exists(Path.Combine(directory.FullName, ".git")) ||
+                 File.Exists(Path.Combine(directory.FullName, ".git"))) &&
+                File.Exists(Path.Combine(directory.FullName, "README.md")))
+                return directory.FullName;
+            directory = directory.Parent;
+        }
+
+        throw new InvalidOperationException(
+            "Could not find repository root. Set OPENCLAW_REPO_ROOT to the repo path.");
+    }
+
+    private static string GetCapabilitiesXamlPath() =>
+        Path.Combine(GetRepositoryRoot(), "src", "OpenClaw.Tray.WinUI", "Pages", "CapabilitiesPage.xaml");
+
+    private static string GetEnUsReswPath() =>
+        Path.Combine(GetRepositoryRoot(), "src", "OpenClaw.Tray.WinUI", "Strings", "en-us", "Resources.resw");
+
+    private static HashSet<string> LoadReswKeys()
+    {
+        var doc = XDocument.Load(GetEnUsReswPath());
+        return doc.Descendants("data")
+            .Select(e => e.Attribute("name")!.Value)
+            .ToHashSet(StringComparer.Ordinal);
+    }
+
+    private static HashSet<string> LoadXamlUids()
+    {
+        var doc = XDocument.Load(GetCapabilitiesXamlPath());
+        return doc.Descendants()
+            .Select(e => e.Attribute(XNs + "Uid")?.Value)
+            .Where(v => !string.IsNullOrEmpty(v))
+            .Cast<string>()
+            .ToHashSet(StringComparer.Ordinal);
+    }
+
+    ///
+    /// Contract for the STT/TTS surface introduced by the audio merge.
+    /// Each entry: x:Uid + the resw key suffixes that MUST exist in en-us.
+    ///
+    public static IEnumerable<object[]> SttAndTtsCardUids => new[]
+    {
+        // STT card (deep-link to dedicated voice settings)
+        new object[] { "CapabilitiesPage_SttCardHeader", new[] { ".Text" } },
+        new object[] { "CapabilitiesPage_SttCardDescription", new[] { ".Text" } },
+        new object[] { "CapabilitiesPage_SttMoreSettingsLink", new[] { ".Content" } },
+        // TTS card (provider picker, ElevenLabs sub-panel)
+        new object[] { "CapabilitiesPage_TtsCardHeader", new[] { ".Text" } },
+        new object[] { "CapabilitiesPage_TtsCardDescription", new[] { ".Text" } },
+        new object[] { "CapabilitiesPage_TtsProviderComboBox", new[] { ".Header" } },
+        new object[] { "CapabilitiesPage_TtsProviderPiper", new[] { ".Content" } },
+        new object[] { "CapabilitiesPage_TtsProviderWindows", new[] { ".Content" } },
+        new object[] { "CapabilitiesPage_TtsProviderElevenLabs",new[] { ".Content" } },
+        new object[] { "CapabilitiesPage_TtsElevenLabsApiKey", new[] { ".Header" } },
+        new object[] { "CapabilitiesPage_TtsElevenLabsVoiceId", new[] { ".Header" } },
+        new object[] { "CapabilitiesPage_TtsElevenLabsModel", new[] { ".Header", ".PlaceholderText" } },
+        new object[] { "CapabilitiesPage_TtsElevenLabsHelp", new[] { ".Text" } },
+    };
+
+    [Theory]
+    [MemberData(nameof(SttAndTtsCardUids))]
+    public void SttOrTtsControl_HasXUid_InCapabilitiesPageXaml(string uid, string[] _)
+    {
+        var uids = LoadXamlUids();
+        Assert.Contains(uid, uids);
+    }
+
+    [Theory]
+    [MemberData(nameof(SttAndTtsCardUids))]
+    public void SttOrTtsControl_AllExpectedReswKeys_ExistInEnUs(string uid, string[] suffixes)
+    {
+        var keys = LoadReswKeys();
+        var missing = suffixes
+            .Select(suffix => uid + suffix)
+            .Where(key => !keys.Contains(key))
+            .ToList();
+
+        Assert.True(missing.Count == 0,
+            $"Missing en-us resw keys for x:Uid '{uid}': {string.Join(", ", missing)}");
+    }
+}
diff --git a/tests/OpenClaw.Tray.Tests/DeepLinkParserTests.cs b/tests/OpenClaw.Tray.Tests/DeepLinkParserTests.cs
index 6f36dfba..61d559c3 100644
---
a/tests/OpenClaw.Tray.Tests/DeepLinkParserTests.cs +++ b/tests/OpenClaw.Tray.Tests/DeepLinkParserTests.cs @@ -121,6 +121,21 @@ public void ParseDeepLink_TrailingSlash_IsStripped() Assert.Equal("settings", result.Path); } + [Theory] + [InlineData("openclaw://send/?message=hello", "send")] + [InlineData("openclaw://agent/?message=hi&key=abc", "agent")] + [InlineData("openclaw://activity/?filter=nodes", "activity")] + public void ParseDeepLink_TrailingSlashBeforeQuery_IsStripped(string uri, string expectedPath) + { + // Windows canonicalizes openclaw://send?... to openclaw://send/?... + // before handing it to us. The slash sits before the `?`, so a naïve + // TrimEnd before query split fails to strip it. Regression test for + // the off-by-one fix in DeepLinkParser.ParseDeepLink. + var result = DeepLinkParser.ParseDeepLink(uri); + Assert.NotNull(result); + Assert.Equal(expectedPath, result!.Path); + } + [Fact] public void ParseDeepLink_CaseInsensitiveScheme() { diff --git a/tests/OpenClaw.Tray.Tests/LocalizationValidationTests.cs b/tests/OpenClaw.Tray.Tests/LocalizationValidationTests.cs index 68eb2b7b..824a99f4 100644 --- a/tests/OpenClaw.Tray.Tests/LocalizationValidationTests.cs +++ b/tests/OpenClaw.Tray.Tests/LocalizationValidationTests.cs @@ -24,6 +24,18 @@ public class LocalizationValidationTests "Onboarding_Connection_Token", "WindowTitle_TrayMenu", "WindowTitle_Update", + // STT/TTS card invariants — these are protocol/brand identifiers + // not user-visible prose. They intentionally read the same in every + // locale: "eleven_multilingual_v2" is an ElevenLabs model + // identifier, "ElevenLabs" is a brand name. + // VoiceOverlayWindow window-title key — matches the convention + // for ChatWindow / HubWindow / CanvasWindow / TrayMenuWindow. + "VoiceOverlayWindow_winexWindowEx_2.Title", + "CapabilitiesPage_TtsElevenLabsModel.PlaceholderText", + "CapabilitiesPage_TtsProviderElevenLabs.Content", + // Sample IDs / brand identifiers — same across locales. 
+        "VoiceSettingsPage_ElevenLabsVoiceIdBox.PlaceholderText",
+        "VoiceSettingsPage_ElevenLabsModelBox.PlaceholderText",
     };
 
     private static readonly string[] RequiredRuntimeOnboardingKeys =
diff --git a/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs b/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs
new file mode 100644
index 00000000..69671057
--- /dev/null
+++ b/tests/OpenClaw.Tray.Tests/NodeCapabilityGatingTests.cs
@@ -0,0 +1,132 @@
+using OpenClawTray.Services;
+
+namespace OpenClaw.Tray.Tests;
+
+///
+/// Pins the optional-capability gating that drives both the gateway client
+/// path and the MCP-only path inside NodeService.RegisterCapabilities.
+///
+/// Privacy-sensitive defaults must be **off** even when settings are missing.
+/// A regression that flips Stt/Tts to default-on would silently advertise
+/// stt.transcribe / tts.speak the moment the tray launches with a fresh
+/// settings file, with no user opt-in.
+///
+public sealed class NodeCapabilityGatingTests : IDisposable
+{
+    private readonly List<string> _tempDirs = new();
+
+    public void Dispose()
+    {
+        foreach (var dir in _tempDirs)
+        {
+            if (Directory.Exists(dir))
+                Directory.Delete(dir, recursive: true);
+        }
+    }
+
+    private SettingsManager NewSettings()
+    {
+        var dir = Path.Combine(Path.GetTempPath(), "openclaw-tray-tests-" + Guid.NewGuid().ToString("N"));
+        Directory.CreateDirectory(dir);
+        _tempDirs.Add(dir);
+        return new SettingsManager(dir);
+    }
+
+    [Fact]
+    public void NullSettings_DefaultOnCapabilities_AreEnabled()
+    {
+        // Defensive default: when settings are not yet loaded, we still
+        // advertise the non-privacy-sensitive capabilities so the node is
+        // usable immediately.
+ Assert.True(NodeCapabilityGating.ShouldRegisterCanvas(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterScreen(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterCamera(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterLocation(null)); + Assert.True(NodeCapabilityGating.ShouldRegisterBrowserProxy(null)); + } + + [Fact] + public void NullSettings_PrivacySensitiveCapabilities_AreDisabled() + { + // Privacy invariant: TTS and STT must require an explicit user + // opt-in. A null/missing settings object must not enable mic capture + // or speaker output. + Assert.False(NodeCapabilityGating.ShouldRegisterTts(null)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(null)); + } + + [Fact] + public void DefaultSettings_PrivacySensitiveCapabilities_AreDisabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void DefaultSettings_OtherCapabilities_AreEnabled() + { + var s = NewSettings(); + Assert.True(NodeCapabilityGating.ShouldRegisterCanvas(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterScreen(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterCamera(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterLocation(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterBrowserProxy(s)); + } + + [Fact] + public void Tts_OnlyAdvertisedWhenExplicitlyEnabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + s.NodeTtsEnabled = true; + Assert.True(NodeCapabilityGating.ShouldRegisterTts(s)); + s.NodeTtsEnabled = false; + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + } + + [Fact] + public void Stt_OnlyAdvertisedWhenExplicitlyEnabled() + { + var s = NewSettings(); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + s.NodeSttEnabled = true; + Assert.True(NodeCapabilityGating.ShouldRegisterStt(s)); + s.NodeSttEnabled = false; + 
Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void TtsAndStt_Independent() + { + // A user who enables only TTS (output) must not silently enable STT + // (input), and vice versa. Each capability is its own consent surface. + var s = NewSettings(); + s.NodeTtsEnabled = true; + s.NodeSttEnabled = false; + Assert.True(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterStt(s)); + + s.NodeTtsEnabled = false; + s.NodeSttEnabled = true; + Assert.False(NodeCapabilityGating.ShouldRegisterTts(s)); + Assert.True(NodeCapabilityGating.ShouldRegisterStt(s)); + } + + [Fact] + public void DefaultOnCapabilities_OnlyDisabledWhenExplicitlySetToFalse() + { + var s = NewSettings(); + s.NodeCanvasEnabled = false; + s.NodeScreenEnabled = false; + s.NodeCameraEnabled = false; + s.NodeLocationEnabled = false; + s.NodeBrowserProxyEnabled = false; + + Assert.False(NodeCapabilityGating.ShouldRegisterCanvas(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterScreen(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterCamera(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterLocation(s)); + Assert.False(NodeCapabilityGating.ShouldRegisterBrowserProxy(s)); + } +} diff --git a/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs b/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs new file mode 100644 index 00000000..06f9a576 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/NodeInvokeActivityFormatterTests.cs @@ -0,0 +1,127 @@ +using OpenClawTray.Services; + +namespace OpenClaw.Tray.Tests; + +/// +/// Privacy regression tests for the activity-stream details formatter that +/// powers both the recent-activity menu and the support bundle. 
+/// +/// The end-to-end persistence path is: +/// NodeService.OnNodeInvokeCompleted (capability handler exception) +/// → App.OnNodeInvokeCompleted +/// → NodeInvokeActivityFormatter.BuildDetails +/// → ActivityStreamService.Add +/// → ActivityStreamService.BuildSupportBundle (when user shares logs) +/// +/// For privacy-sensitive commands (mic / camera / screen), no caller-supplied +/// arg or runtime detail may reach support bundles. This test pins that. +/// +[Collection(ActivityStreamServiceCollection.Name)] +public sealed class NodeInvokeActivityFormatterTests : IDisposable +{ + public NodeInvokeActivityFormatterTests() => ActivityStreamService.Clear(); + public void Dispose() => ActivityStreamService.Clear(); + + [Theory] + [InlineData("stt.transcribe")] + [InlineData("stt.listen")] + [InlineData("stt.status")] + [InlineData("camera.snap")] + [InlineData("camera.clip")] + [InlineData("screen.snapshot")] + [InlineData("screen.record")] + public void PrivacySensitive_FailedInvoke_OmitsErrorTextFromDetails(string command) + { + const string secret = "secret-language-or-device-detail"; + var details = NodeInvokeActivityFormatter.BuildDetails(command, ok: false, durationMs: 4321, error: secret); + + Assert.Equal("privacy-sensitive · 4321 ms · error", details); + Assert.DoesNotContain(secret, details); + } + + [Fact] + public void PrivacySensitive_FailedInvoke_SecretDoesNotReachSupportBundle() + { + const string secret = "secret-language-or-device-detail"; + var details = NodeInvokeActivityFormatter.BuildDetails("stt.transcribe", ok: false, durationMs: 1234, error: secret); + + ActivityStreamService.Add( + category: "node.invoke", + title: "node.invoke failed: stt.transcribe", + details: details, + nodeId: "test-node"); + + var bundle = ActivityStreamService.BuildSupportBundle(); + Assert.DoesNotContain(secret, bundle); + Assert.Contains("privacy-sensitive · 1234 ms · error", bundle); + } + + [Fact] + public void PrivacySensitive_SuccessfulInvoke_OmitsAllDetail() 
+ { + var details = NodeInvokeActivityFormatter.BuildDetails("stt.transcribe", ok: true, durationMs: 800, error: null); + Assert.Equal("privacy-sensitive · 800 ms", details); + } + + [Fact] + public void NonPrivacySensitive_FailedInvoke_KeepsErrorForDiagnostics() + { + // Non-privacy-sensitive commands (metadata / exec) keep the error text + // because they're useful for diagnostics and don't carry mic/camera args. + var details = NodeInvokeActivityFormatter.BuildDetails( + "device.status", + ok: false, + durationMs: 50, + error: "gateway unreachable"); + + Assert.Equal("metadata · 50 ms · gateway unreachable", details); + } + + [Fact] + public void NonPrivacySensitive_FailedInvoke_NullError_FallsBackToUnknown() + { + var details = NodeInvokeActivityFormatter.BuildDetails("device.status", ok: false, durationMs: 0, error: null); + Assert.Equal("metadata · 0 ms · unknown error", details); + } + + [Fact] + public void Exec_FailedInvoke_KeepsErrorForDiagnostics() + { + var details = NodeInvokeActivityFormatter.BuildDetails( + "system.run", + ok: false, + durationMs: 100, + error: "exit code 1"); + + Assert.Equal("exec · 100 ms · exit code 1", details); + } + + [Fact] + public void NegativeDuration_ClampsToZero() + { + var details = NodeInvokeActivityFormatter.BuildDetails("device.status", ok: true, durationMs: -7, error: null); + Assert.Equal("metadata · 0 ms", details); + } + + [Theory] + [InlineData("stt.transcribe", "privacy-sensitive")] + [InlineData("STT.Transcribe", "privacy-sensitive")] + [InlineData("stt.listen", "privacy-sensitive")] + [InlineData("Stt.Listen", "privacy-sensitive")] + [InlineData("stt.status", "privacy-sensitive")] + [InlineData("stt.future-command", "privacy-sensitive")] // any new stt.* defaults privacy-sensitive + [InlineData("camera.snap", "privacy-sensitive")] + [InlineData("camera.clip", "privacy-sensitive")] + [InlineData("screen.snapshot", "privacy-sensitive")] + [InlineData("screen.record", "privacy-sensitive")] + 
[InlineData("system.run", "exec")] + [InlineData("system.run.shell", "exec")] + [InlineData("device.status", "metadata")] + [InlineData("tts.speak", "privacy-sensitive")] // TTS errors can leak ElevenLabs key fragments / device names + [InlineData("tts.future-command", "privacy-sensitive")] // any future tts.* defaults privacy-sensitive + [InlineData("", "metadata")] + public void GetPrivacyClass_KnownCommands(string command, string expected) + { + Assert.Equal(expected, NodeInvokeActivityFormatter.GetPrivacyClass(command)); + } +} diff --git a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj index 30f0256e..40340184 100644 --- a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj +++ b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj @@ -26,6 +26,8 @@ + + diff --git a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs index f1996b3c..03b84c6f 100644 --- a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +++ b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs @@ -37,11 +37,20 @@ public void RoundTrip_AllFields_Preserved() NodeCameraEnabled = false, NodeLocationEnabled = true, NodeBrowserProxyEnabled = false, + NodeSttEnabled = true, + SttLanguage = "en-GB", + SttModelName = "tiny", + SttSilenceTimeout = 2.5f, + VoiceTtsEnabled = false, + VoiceAudioFeedback = false, NodeTtsEnabled = true, TtsProvider = "elevenlabs", TtsElevenLabsApiKey = "elevenlabs-key", TtsElevenLabsModel = "eleven_multilingual_v2", TtsElevenLabsVoiceId = "voice-123", + TtsWindowsVoiceId = "Microsoft Zira Desktop", + HubNavPaneOpen = false, + TtsPiperVoiceId = "fr_FR-siwis-low", HasSeenActivityStreamTip = true, SkippedUpdateTag = "v1.2.3", NotifyChatResponses = false, @@ -82,11 +91,20 @@ public void RoundTrip_AllFields_Preserved() Assert.Equal(original.NodeCameraEnabled, restored.NodeCameraEnabled); Assert.Equal(original.NodeLocationEnabled, restored.NodeLocationEnabled); 
Assert.Equal(original.NodeBrowserProxyEnabled, restored.NodeBrowserProxyEnabled); + Assert.Equal(original.NodeSttEnabled, restored.NodeSttEnabled); + Assert.Equal(original.SttLanguage, restored.SttLanguage); + Assert.Equal(original.SttModelName, restored.SttModelName); + Assert.Equal(original.SttSilenceTimeout, restored.SttSilenceTimeout); + Assert.Equal(original.VoiceTtsEnabled, restored.VoiceTtsEnabled); + Assert.Equal(original.VoiceAudioFeedback, restored.VoiceAudioFeedback); Assert.Equal(original.NodeTtsEnabled, restored.NodeTtsEnabled); Assert.Equal(original.TtsProvider, restored.TtsProvider); Assert.Equal(original.TtsElevenLabsApiKey, restored.TtsElevenLabsApiKey); Assert.Equal(original.TtsElevenLabsModel, restored.TtsElevenLabsModel); Assert.Equal(original.TtsElevenLabsVoiceId, restored.TtsElevenLabsVoiceId); + Assert.Equal(original.TtsWindowsVoiceId, restored.TtsWindowsVoiceId); + Assert.Equal(original.HubNavPaneOpen, restored.HubNavPaneOpen); + Assert.Equal(original.TtsPiperVoiceId, restored.TtsPiperVoiceId); Assert.Equal(original.HasSeenActivityStreamTip, restored.HasSeenActivityStreamTip); Assert.Equal(original.SkippedUpdateTag, restored.SkippedUpdateTag); Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses); @@ -144,8 +162,10 @@ public void MissingFields_UseDefaults() Assert.True(settings.NodeCameraEnabled); Assert.True(settings.NodeLocationEnabled); Assert.True(settings.NodeBrowserProxyEnabled); + Assert.False(settings.NodeSttEnabled); + Assert.Equal("auto", settings.SttLanguage); Assert.False(settings.NodeTtsEnabled); - Assert.Equal("windows", settings.TtsProvider); + Assert.Equal("piper", settings.TtsProvider); Assert.Null(settings.TtsElevenLabsApiKey); Assert.Null(settings.TtsElevenLabsModel); Assert.Null(settings.TtsElevenLabsVoiceId); @@ -153,9 +173,24 @@ public void MissingFields_UseDefaults() Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.NotifyChatResponses); Assert.True(settings.PreferStructuredCategories); + 
// HubNavPaneOpen defaults to true (NavView starts expanded for new + // installs and for any settings file that predates the field). + Assert.True(settings.HubNavPaneOpen); Assert.Null(settings.UserRules); } + [Fact] + public void HubNavPaneOpen_DefaultsTrue_ForEmptyJson() + { + // Existing users have a settings file written before HubNavPaneOpen + // existed. The default-true initializer must survive deserialization + // of a missing field so the NavView lands expanded for them, not + // silently collapsed. + var settings = SettingsData.FromJson("{}"); + Assert.NotNull(settings); + Assert.True(settings!.HubNavPaneOpen); + } + [Fact] public void BackwardCompatibility_OldSettingsWithoutNewFields() { @@ -198,14 +233,18 @@ public void BackwardCompatibility_OldSettingsWithoutNewFields() Assert.True(settings.NodeCameraEnabled); Assert.True(settings.NodeLocationEnabled); Assert.True(settings.NodeBrowserProxyEnabled); + Assert.False(settings.NodeSttEnabled); + Assert.Equal("auto", settings.SttLanguage); Assert.False(settings.NodeTtsEnabled); - Assert.Equal("windows", settings.TtsProvider); + Assert.Equal("piper", settings.TtsProvider); Assert.Null(settings.TtsElevenLabsApiKey); Assert.Null(settings.TtsElevenLabsModel); Assert.Null(settings.TtsElevenLabsVoiceId); Assert.False(settings.HasSeenActivityStreamTip); Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.GlobalHotkeyEnabled); + // HubNavPaneOpen wasn't in this older JSON shape; default true. + Assert.True(settings.HubNavPaneOpen); Assert.Null(settings.UserRules); }