Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
f2568de
Add Windows STT transcribe capability
RBrid May 2, 2026
0231bdb
Merge origin/master into STT capability branch
RBrid May 2, 2026
54b01e5
stt: privacy hardening, localization, and test coverage
RBrid May 4, 2026
0f50220
Remove fake/sample data from 6 UI pages
ranjeshj May 5, 2026
e77bd55
feat: add voice/audio support with local Whisper STT
ranjeshj May 5, 2026
72d0d9d
Merge master into user/rbrid/stt-capability
RBrid May 5, 2026
702ebe9
stt/tts: refill settings UI gaps after the unified Hub merge
RBrid May 5, 2026
d3ea24d
Merge remote-tracking branch 'ranjeshj/user/ranjeshj/audio' into user…
RBrid May 5, 2026
1e44186
Merge user/rbrid/stt-capability into user/rbrid/audio-support1
RBrid May 5, 2026
3a6db5f
audio: extend privacy class + tests for stt.listen and stt.status
RBrid May 5, 2026
8d167ec
audio: STM, locale audit, and coverage tests for STT/TTS card
RBrid May 5, 2026
3a83d26
audio: extract SttEngineSelector + tests for engine selection rules
RBrid May 5, 2026
c35b5b7
audio: security review fixes from STM walkthrough
RBrid May 5, 2026
ff11467
audio: drop WinRT SpeechRecognizer + SAPI fallback; Whisper-only
RBrid May 5, 2026
a7cd07a
Merge remote-tracking branch 'origin/master' into user/rbrid/audio-su…
RBrid May 5, 2026
5ed6db3
audio: add Piper TTS provider via Sherpa-ONNX
RBrid May 6, 2026
8ccdcb6
audio: voice download UI, Piper-as-default, first-listen polish
RBrid May 6, 2026
5f8210a
audio: address rubber-duck review (Highs #2-#5, Mediums #6-#8, Low #9)
RBrid May 6, 2026
f8facfb
audio: SHA-256 verification of Whisper models and Piper voices
RBrid May 6, 2026
945c1fb
audio: Download Model button works without VoiceService
RBrid May 6, 2026
361b409
ux: Companion rename, expanded NavView memory, right-click opens Hub
RBrid May 6, 2026
facda94
Merge remote-tracking branch 'origin/master' into user/rbrid/audio-su…
RBrid May 6, 2026
f569c20
ux: rubber-duck #2 — restore minimized Hub on right-click; pin pane d…
RBrid May 6, 2026
259f8b0
fix: deep-link parser strips trailing slash before query (#-)
RBrid May 6, 2026
c49ca1c
audio: don't drop final utterance on stop or timeout; bound transcrip…
RBrid May 6, 2026
1bf93a7
audio: SHA-256 verification of Silero VAD model
RBrid May 6, 2026
90aa5f5
docs: bring skill.md back in sync with capability registry
RBrid May 6, 2026
7b97b9d
ux: throttle Whisper/Piper download progress UI; wire Re-download button
RBrid May 6, 2026
90fa065
i18n: localize VoiceSettingsPage and VoiceOverlayWindow surfaces
RBrid May 6, 2026
c682a87
audio: stt.transcribe is now a true fixed-duration capture
RBrid May 6, 2026
fcccf31
i18n: localize VoiceOverlayWindow root window title
RBrid May 6, 2026
32497a7
audio: gate stt.* on file presence, not in-memory load state
RBrid May 6, 2026
6aae03a
ux: voice UI testing round — localization, shutdown, and Capabilities…
RBrid May 6, 2026
1e39ba4
i18n: align VoiceOverlayWindow root x:Uid with WindowEx convention
RBrid May 6, 2026
3e89b72
privacy: stop leaking ex.Message into voice UI status text
RBrid May 6, 2026
597a01d
audio: include stt.listen + stt.status in DangerousCommands
RBrid May 7, 2026
cd3f969
fix(audio): isolate shared download cancellation
shanselman May 7, 2026
14942f9
fix(tray): keep right-click to context menu only
shanselman May 7, 2026
032baa1
fix(voice): allow local overlay without node pairing
shanselman May 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t
| **Canvas** | `canvas.present`, `canvas.hide`, `canvas.navigate`, `canvas.eval`, `canvas.snapshot`, `canvas.a2ui.push`, `canvas.a2ui.pushJSONL`, `canvas.a2ui.reset` | Display and control a WebView2 window |
| **Screen** | `screen.snapshot`, `screen.record` | Capture screenshots and fixed-duration MP4 screen recordings |
| **Camera** | `camera.list`, `camera.snap`, `camera.clip` | Enumerate cameras and capture still photos or short video clips |
| **Speech-to-text** | `stt.transcribe` | Capture audio from the default microphone for a bounded duration and return transcribed text. Default-off; opt-in via Settings. When enabled, advertised to both gateway callers (subject to gateway allowlist) and local MCP clients (subject to bearer token). |
| **Location** | `location.get` | Return Windows geolocation when permission is available |
| **Device** | `device.info`, `device.status` | Return Windows host/app metadata and lightweight status |
| **Text-to-speech** | `tts.speak` | Speak text aloud through Windows speech synthesis, or ElevenLabs when configured |
Expand Down
35 changes: 35 additions & 0 deletions docs/WINDOWS_NODE_TESTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ These features need the gateway to send `node.invoke` commands:
| `location.get` | Get Windows location | Uses Windows location permission/settings |
| `device.info` / `device.status` | Device metadata/status | Returns host/app/locale plus battery/storage/network/uptime payloads |
| `browser.proxy` | Proxy browser-control host requests | Requires Browser proxy bridge enabled, a compatible browser-control host listening on gateway port + 2, and matching browser-control auth |
| `stt.transcribe` | Speech-to-text from default microphone | Default-off; bounded `maxDurationMs` ≤ 30000; concatenates phrases until duration elapses; requires explicit gateway allowlist |
| `tts.speak` | Speak text aloud | Requires Text-to-speech playback enabled in Settings; gateway mode also requires `tts.speak` in `gateway.nodes.allowCommands` |

## Capabilities Advertised
Expand Down Expand Up @@ -112,6 +113,40 @@ When the node connects, it advertises these capabilities:
- If you see "Camera access blocked", enable camera access for desktop apps in Windows Privacy settings
- Packaged MSIX builds will show the system consent prompt automatically

### `stt.transcribe` returns "Speech recognition failed" or "Internal Speech Error"
- Open Windows Settings → Privacy & security → Speech (`ms-settings:privacy-speech`)
- Turn **Online speech recognition** on. The Windows speech recognizer's default dictation grammar often fails without it, and Windows surfaces the resulting unmapped HRESULT as "Internal Speech Error"
- Open Windows Settings → Time & language → Language & region (`ms-settings:regionlanguage`), select your display language → Language options, and confirm **Speech** appears under Installed features (install it if not, ~50 MB; reboot or sign out/in afterward)
- Verify the recognizer end-to-end with `ms-settings:speech` → "Microphone" → **Get started** before re-trying `stt.transcribe`

### `stt.transcribe` returns "Microphone permission denied"
- Open Windows Settings → Privacy & security → Microphone
- Ensure **Microphone access** (top-level toggle) is on
- For **unpackaged** tray builds (the default `.\build.ps1` output): ensure **Let desktop apps access your microphone** is on. The tray exe will **not** appear as its own row — desktop-app access is granted as a group, not per-app
- For **packaged MSIX** tray builds: the tray appears as its own entry under "Let apps access your microphone" and must be individually enabled (the OS shows a consent prompt on first use)
- After changing permissions, re-pair the node so the gateway picks up the new advertised command

### `stt.transcribe` returns "Language pack 'X' is not installed"
- Open Windows Settings → Time & language → Language & region
- Add the requested display language and ensure the **Speech** optional feature is installed
- Restart the tray after installing the speech pack

### Manual STT validation
1. Enable Node Mode in Settings.
2. Enable **Speech-to-text (microphone)** in Settings → Node mode.
3. Append `stt.transcribe` to your existing gateway allowlist (do **not** copy a literal `...` — substitute the commands you already allow). For example, starting from the recommended Windows safe companion list:
```bash
openclaw config set gateway.nodes.allowCommands '["canvas.present","canvas.hide","canvas.navigate","canvas.eval","canvas.snapshot","canvas.a2ui.push","canvas.a2ui.pushJSONL","canvas.a2ui.reset","camera.list","location.get","screen.snapshot","device.info","device.status","system.execApprovals.get","system.execApprovals.set","stt.transcribe"]'
openclaw gateway restart
```
4. Re-pair or re-approve the node so the gateway refreshes its command snapshot.
5. Invoke and speak a short phrase:
```bash
openclaw nodes invoke --node <id> --command stt.transcribe \
--params '{"maxDurationMs":5000,"language":"en-US"}'
```
6. The Windows OS microphone indicator should appear during recognition. Confirm the response payload contains `transcribed:true` along with the recognized text.

## Remaining Work (Roadmap)

1. ~~**system.run + exec approvals**~~ ✅ Implemented
Expand Down
6 changes: 4 additions & 2 deletions docs/gateway-node-integration.md
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ Recommended gateway defaults:
| Command bucket | Windows default? | Reason |
|----------------|------------------|--------|
| Safe declared companion commands: `canvas.*`, `camera.list`, `location.get`, `screen.snapshot`, `device.info`, `device.status` | Yes | Matches macOS parity and only applies when declared by the node |
| Dangerous/privacy-heavy commands: `camera.snap`, `camera.clip`, `screen.record`, write commands like `contacts.add` | No | Existing gateway model already requires explicit `gateway.nodes.allowCommands` |
| Dangerous/privacy-heavy commands: `camera.snap`, `camera.clip`, `screen.record`, `stt.transcribe`, write commands like `contacts.add` | No | Existing gateway model already requires explicit `gateway.nodes.allowCommands` |
| Exec commands: `system.run`, `system.run.prepare`, `system.which`, `system.notify`, `browser.proxy` | Yes | Existing Windows headless-host behavior |

Until the gateway expands Windows safe defaults, the practical local solution is:
Expand Down Expand Up @@ -364,6 +364,7 @@ Privacy-sensitive commands should stay out of the default safe list and should o
camera.snap
camera.clip
screen.record
stt.transcribe
```

After changing either `gateway.nodes.allowCommands` or `gateway.nodes.denyCommands`, re-approve or re-pair the Windows node. Approved device records may keep a snapshot of the commands that were visible at approval time, so a gateway restart alone may not refresh existing approvals.
Expand Down Expand Up @@ -424,6 +425,7 @@ Proposal:
- `camera.snap`
- `camera.clip`
- `screen.record`
- `stt.transcribe`
- write commands such as `contacts.add`, `calendar.add`, etc.

This does not grant capabilities to headless Windows hosts by itself. A command still has to pass both gates: the node must declare it in `commands`, and the gateway policy must allow it. Headless Windows node hosts that only declare `system.run` / `system.which` remain exec-only.
Expand All @@ -441,7 +443,7 @@ When shipping the Windows node, README/wiki should tell users:
> ```
> Then re-pair the node (`openclaw devices reject <old-id>` + re-approve).
>
> Add `camera.snap`, `camera.clip`, and `screen.record` only when you explicitly want to allow privacy-sensitive camera or screen capture.
> Add `camera.snap`, `camera.clip`, `screen.record`, and `stt.transcribe` only when you explicitly want to allow privacy-sensitive camera, screen, or microphone capture.
>
> The Windows tray Command Center (`openclaw://commandcenter`) surfaces these policy problems directly: it separates safe companion allowlist fixes from privacy-sensitive opt-ins and provides copyable repair text for safe fixes or pending pairing approval.

Expand Down
70 changes: 70 additions & 0 deletions src/OpenClaw.Shared/Audio/AudioModels.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
using System;
using System.Collections.Generic;

namespace OpenClaw.Shared.Audio;

/// <summary>
/// Result of a single speech-to-text transcription segment. One silence-bounded
/// utterance may produce several segments; see <see cref="UtteranceResult"/> for
/// the aggregated, whole-utterance form.
/// </summary>
public sealed class TranscriptionResult
{
/// <summary>Transcribed text for this segment; empty when nothing was recognized.</summary>
public string Text { get; init; } = "";
/// <summary>Start of the segment relative to capture start.</summary>
public TimeSpan Start { get; init; }
/// <summary>End of the segment relative to capture start.</summary>
public TimeSpan End { get; init; }
/// <summary>Language code for the segment (defaults to "en").</summary>
public string Language { get; init; } = "en";
}

/// <summary>
/// Aggregated result of a single silence-bounded utterance — i.e. all the
/// Whisper segments produced from one VAD-bounded speech burst, combined.
/// Consumers that need "what the user said" (chat submission, stt.listen)
/// should listen for this event instead of per-segment
/// <see cref="TranscriptionResult"/> to avoid sending partial text.
/// </summary>
public sealed class UtteranceResult
{
/// <summary>Concatenated text across all segments, single-spaced.</summary>
public string Text { get; init; } = "";
/// <summary>Language detected on the first segment, or null if no segments.</summary>
public string? Language { get; init; }
/// <summary>Start of the first segment relative to capture start.</summary>
public TimeSpan Start { get; init; }
/// <summary>End of the last segment relative to capture start.</summary>
public TimeSpan End { get; init; }
/// <summary>Immutable snapshot of the per-segment results; empty when the utterance produced no segments.</summary>
public IReadOnlyList<TranscriptionResult> Segments { get; init; } = Array.Empty<TranscriptionResult>();
}

/// <summary>Voice-activity detection event describing whether the audio stream currently contains speech.</summary>
public sealed class VadEvent
{
/// <summary>True when the detector classifies the current audio as speech.</summary>
public bool IsSpeaking { get; init; }
/// <summary>Speech probability reported by the detector (0.0–1.0); presumably
/// compared against <see cref="AudioPipelineOptions.VadThreshold"/> — confirm in the pipeline.</summary>
public float Probability { get; init; }
}

/// <summary>
/// Configuration for the audio pipeline (microphone capture → VAD → Whisper STT).
/// All members are init-only, so an instance is immutable after construction.
/// </summary>
public sealed class AudioPipelineOptions
{
/// <summary>Path to the Whisper GGML model file. Empty by default; must be set before use.</summary>
public string ModelPath { get; init; } = "";

/// <summary>Language code for STT (e.g. "en", "auto"). Defaults to "auto" (let the model detect).</summary>
public string Language { get; init; } = "auto";

/// <summary>Seconds of silence before a speech segment is finalized. Defaults to 1.5 s.</summary>
public float SilenceTimeoutSeconds { get; init; } = 1.5f;

/// <summary>Optional audio device ID. Null = system default microphone.</summary>
public string? DeviceId { get; init; }

/// <summary>VAD probability threshold (0.0–1.0). Audio above this is considered speech. Defaults to 0.3.</summary>
public float VadThreshold { get; init; } = 0.3f;
}

/// <summary>Lifecycle state of the audio pipeline.</summary>
public enum AudioPipelineState
{
/// <summary>Pipeline is not running.</summary>
Stopped,
/// <summary>Transitional state while the pipeline is starting up.</summary>
Starting,
/// <summary>Pipeline is running and capturing audio.</summary>
Listening,
/// <summary>Pipeline is busy processing captured audio (e.g. transcription in progress).</summary>
Processing,
/// <summary>Pipeline encountered an error; NOTE(review): recovery semantics not visible here — confirm whether a restart is required.</summary>
Error
}
Loading
Loading