From 962b042520b09546377fe2d208e108f467f45554 Mon Sep 17 00:00:00 2001 From: Ivan Date: Wed, 13 May 2026 17:00:25 +0200 Subject: [PATCH] Make speech-android Android-only by deleting code that moved to speech-core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit speech-core PRs #19 and #20 lifted all the model wrappers, audio utilities, and Linux examples out of this repo. This PR finishes the migration by deleting the now-duplicated source and slimming the native side to a single ~250-line JNI bridge. Net change: 51 files, +717 / -7412. Bumped: - speech-core submodule pointer: 679869d → ba75579 (PR #19 + #20 merged) Deleted (now in speech-core): - sdk/src/main/cpp/audio/ — fft, mel, stft (live at speech_core::audio) - sdk/src/main/cpp/util/ — json.h - sdk/src/main/cpp/models/ — silero_vad, parakeet_stt, kokoro_tts + phonemizer + multilingual, deepfilter, onnx_engine, inference_engine, onnx_backend, soc_detect - linux/ — moved verbatim to speech-core/examples/linux/ (libspeech.so, demo, CLIs, integration test) Rewrote: - sdk/src/main/cpp/jni_bridge.cpp (388 → 269 lines) — the model wrappers in speech_core::* directly implement VADInterface / STTInterface / TTSInterface / EnhancerInterface, so the 100+ lines of C-vtable adapter boilerplate (vad_process_chunk, stt_transcribe, tts_synthesize, etc.) that wrapped each model class into sc_*_vtable_t structs are gone. The bridge now constructs speech_core::SileroVad / ParakeetStt / KokoroTts and hands references to speech_core::VoicePipeline. - sdk/src/main/cpp/CMakeLists.txt — replaced the manual list of speech-core source files with add_subdirectory(${SPEECH_CORE_DIR}) using SPEECH_CORE_WITH_ONNX=ON. Link speech_android against speech_core_models. Compatibility: - Kotlin contract unchanged. NativeBridge.onEvent still receives the same int event-type values (0..11). 
The new speech_core::EventType enum has ResponseDone and ResponseAudioDelta swapped relative to the old C ABI (sc_event_t.type) — added to_kotlin_event() to map explicitly so the Kotlin side keeps working without any change. - Public Kotlin API (SpeechPipeline, SpeechConfig, SpeechEvent) untouched. Docs: - README.md rewritten as Android-only (Linux/Yocto/QNN sections moved to a one-line cross-link pointing at speech-core/examples/linux). - All 9 README translations updated to mirror the new structure (zh, ja, ko, es, de, fr, hi, pt, ru) with existing high-quality translations preserved where the underlying English text is unchanged. - AGENTS.md rewritten — Android-only scope, points contributors at speech-core for any C++ / model / Linux changes. - .gitignore drops the linux/tests/models/ and /ort-linux/ entries that are no longer relevant. - setup.sh trimmed to just the Android ORT download + submodule init (it was previously rewriting the .gitignore on every invocation). Verified locally: - ./gradlew :sdk:externalNativeBuildDebug — BUILD SUCCESSFUL, 5.6 MB libspeech_android.so produced for arm64-v8a, links libonnxruntime.so and libc++_shared.so cleanly. - ./gradlew :sdk:assembleDebug :sdk:test — BUILD SUCCESSFUL, 77 tasks. Next: connectedAndroidTest needs to run on an emulator (downloads 1.2 GB of models on first run); will run that in CI rather than locally. 
--- .gitignore | 6 +- AGENTS.md | 140 +- README.md | 139 +- README_de.md | 139 +- README_es.md | 169 +- README_fr.md | 135 +- README_hi.md | 135 +- README_ja.md | 139 +- README_ko.md | 139 +- README_pt.md | 168 +- README_ru.md | 139 +- README_zh.md | 134 +- linux/CMakeLists.txt | 102 - linux/README.md | 151 -- linux/demo/main.cpp | 135 -- linux/include/speech.h | 67 - linux/setup_linux.sh | 60 - linux/src/speech.cpp | 259 --- linux/tests/download_models.sh | 40 - linux/tests/test_pipeline.cpp | 284 --- linux/toolchain-aarch64.cmake | 11 - linux/tools/phonemize.cpp | 47 - linux/tools/synthesize.cpp | 110 - linux/tools/transcribe.cpp | 262 --- sdk/src/main/cpp/CMakeLists.txt | 76 +- sdk/src/main/cpp/audio/fft.cpp | 92 - sdk/src/main/cpp/audio/fft.h | 13 - sdk/src/main/cpp/audio/mel.cpp | 163 -- sdk/src/main/cpp/audio/mel.h | 20 - sdk/src/main/cpp/audio/stft.cpp | 64 - sdk/src/main/cpp/audio/stft.h | 33 - sdk/src/main/cpp/jni_bridge.cpp | 308 +-- sdk/src/main/cpp/models/deepfilter.cpp | 192 -- sdk/src/main/cpp/models/deepfilter.h | 58 - sdk/src/main/cpp/models/inference_engine.h | 73 - .../main/cpp/models/kokoro_multilingual.cpp | 1841 ----------------- sdk/src/main/cpp/models/kokoro_multilingual.h | 51 - sdk/src/main/cpp/models/kokoro_phonemizer.cpp | 456 ---- sdk/src/main/cpp/models/kokoro_phonemizer.h | 85 - sdk/src/main/cpp/models/kokoro_tts.cpp | 258 --- sdk/src/main/cpp/models/kokoro_tts.h | 41 - sdk/src/main/cpp/models/onnx_backend.h | 131 -- sdk/src/main/cpp/models/onnx_engine.h | 122 -- sdk/src/main/cpp/models/parakeet_stt.cpp | 412 ---- sdk/src/main/cpp/models/parakeet_stt.h | 78 - sdk/src/main/cpp/models/silero_vad.cpp | 74 - sdk/src/main/cpp/models/silero_vad.h | 29 - sdk/src/main/cpp/models/soc_detect.cpp | 88 - sdk/src/main/cpp/util/json.h | 241 --- setup.sh | 18 - speech-core | 2 +- 51 files changed, 717 insertions(+), 7412 deletions(-) delete mode 100644 linux/CMakeLists.txt delete mode 100644 linux/README.md delete mode 100644 linux/demo/main.cpp 
delete mode 100644 linux/include/speech.h delete mode 100755 linux/setup_linux.sh delete mode 100644 linux/src/speech.cpp delete mode 100755 linux/tests/download_models.sh delete mode 100644 linux/tests/test_pipeline.cpp delete mode 100644 linux/toolchain-aarch64.cmake delete mode 100644 linux/tools/phonemize.cpp delete mode 100644 linux/tools/synthesize.cpp delete mode 100644 linux/tools/transcribe.cpp delete mode 100644 sdk/src/main/cpp/audio/fft.cpp delete mode 100644 sdk/src/main/cpp/audio/fft.h delete mode 100644 sdk/src/main/cpp/audio/mel.cpp delete mode 100644 sdk/src/main/cpp/audio/mel.h delete mode 100644 sdk/src/main/cpp/audio/stft.cpp delete mode 100644 sdk/src/main/cpp/audio/stft.h delete mode 100644 sdk/src/main/cpp/models/deepfilter.cpp delete mode 100644 sdk/src/main/cpp/models/deepfilter.h delete mode 100644 sdk/src/main/cpp/models/inference_engine.h delete mode 100644 sdk/src/main/cpp/models/kokoro_multilingual.cpp delete mode 100644 sdk/src/main/cpp/models/kokoro_multilingual.h delete mode 100644 sdk/src/main/cpp/models/kokoro_phonemizer.cpp delete mode 100644 sdk/src/main/cpp/models/kokoro_phonemizer.h delete mode 100644 sdk/src/main/cpp/models/kokoro_tts.cpp delete mode 100644 sdk/src/main/cpp/models/kokoro_tts.h delete mode 100644 sdk/src/main/cpp/models/onnx_backend.h delete mode 100644 sdk/src/main/cpp/models/onnx_engine.h delete mode 100644 sdk/src/main/cpp/models/parakeet_stt.cpp delete mode 100644 sdk/src/main/cpp/models/parakeet_stt.h delete mode 100644 sdk/src/main/cpp/models/silero_vad.cpp delete mode 100644 sdk/src/main/cpp/models/silero_vad.h delete mode 100644 sdk/src/main/cpp/models/soc_detect.cpp delete mode 100644 sdk/src/main/cpp/util/json.h diff --git a/.gitignore b/.gitignore index 25af2ea..f28a3e6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,12 +5,8 @@ build/ .idea/ local.properties -# ONNX Runtime (downloaded by setup.sh / setup_linux.sh) +# ONNX Runtime (downloaded by setup.sh) /ort/ -/ort-linux/ - -# Test models 
(downloaded by linux/tests/download_models.sh) -linux/tests/models/ # Native build artifacts .cxx/ diff --git a/AGENTS.md b/AGENTS.md index 74e0840..20dcfae 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,17 +2,23 @@ ## Project -speech-android — on-device speech SDK for Android and embedded Linux (VAD + STT + TTS + noise cancellation). +speech-android — on-device speech SDK for Android (VAD + STT + TTS + noise cancellation). + +Thin Kotlin SDK + JNI bridge over the [speech-core](https://github.com/soniqo/speech-core) +C++ engine, which provides the orchestration pipeline AND the ONNX Runtime +model wrappers (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3). This +repo owns only the Android packaging and a single ~250-line JNI bridge. + +Linux/automotive support moved to [speech-core's `examples/linux/`](https://github.com/soniqo/speech-core/tree/main/examples/linux). ## Structure -- `speech-core/` — C++17 git submodule, pipeline orchestration (do not modify directly) -- `sdk/src/main/cpp/` — ONNX Runtime model implementations, JNI bridge, audio DSP +- `speech-core/` — git submodule (do not modify directly; open PRs against soniqo/speech-core) +- `sdk/src/main/cpp/` — `jni_bridge.cpp` + `CMakeLists.txt`. That's it. All model code lives in speech-core. - `sdk/src/main/kotlin/com/soniqo/speech/` — Kotlin public SDK - `sdk/src/androidTest/` — instrumented e2e tests -- `linux/` — embedded Linux C API (automotive/Yocto) - `app/` — demo application -- `setup.sh` — downloads ONNX Runtime, initializes submodule +- `setup.sh` — downloads ONNX Runtime, initializes the speech-core submodule ## Build @@ -24,68 +30,13 @@ speech-android — on-device speech SDK for Android and embedded Linux (VAD + ST ## Tests -### Android (emulator or device) - -```bash -./gradlew :sdk:connectedAndroidTest -``` - -Models download automatically via `ModelManager.ensureModels()`. -23 tests across 5 suites: SileroVadTest, ParakeetSttTest, KokoroTtsTest, PipelineE2ETest, BargeInTest. 
- -### Linux - -```bash -# 1. Download ONNX Runtime -linux/setup_linux.sh - -# 2. Download test models -linux/tests/download_models.sh - -# 3. Build -cd linux && cmake -B build -DORT_DIR=../ort-linux && cmake --build build - -# 4. Run (set model dir) -SPEECH_MODEL_DIR=tests/models ./build/speech_test -``` - -11 tests: config, lifecycle, speech detection, concurrency, null safety. - -## Models - -ONNX models hosted on HuggingFace under `aufklarer/` org. INT8 is default. -Parakeet TDT v3 — multilingual STT (114 languages, 8192 BPE vocab). -ModelManager.kt handles download and caching. - -## Key files - -- `jni_bridge.cpp` — wires ONNX models to speech-core C API via vtables -- `SpeechPipeline.kt` — main public API -- `parakeet_stt.cpp` — STT with TDT greedy decoder + per-feature mel normalization -- `kokoro_tts.cpp` + `kokoro_phonemizer.cpp` — TTS with dictionary-based phonemizer -- `silero_vad.cpp` — voice activity detection -- `deepfilter.cpp` — noise cancellation with STFT/ERB processing -- `onnx_engine.h` — platform-aware ONNX Runtime wrapper (Android NNAPI / Linux QNN) -- `linux/src/speech.cpp` — Linux C API implementation -- `linux/include/speech.h` — Linux public C header - -## Workflow - -- **Never push directly to main.** Create a feature branch, open a PR, and merge after review. -- Branch naming: `feat/description`, `fix/description`, `chore/description` -- PRs should include: summary, test plan, and link to related issues -- Tag releases from main after PR is merged: `git tag v0.0.X && git push origin v0.0.X` -- CI runs on tags: builds SDK, runs unit tests, publishes to Maven Central + GitHub Packages, creates GitHub Release with APK - -## Testing - ### Unit tests (no device needed) ```bash ./gradlew :sdk:test ``` -15 tests: download retry, resume, timeout, validation, edge cases. +Download retry / resume / timeout / validation / edge cases. ### E2E tests (arm64 emulator or device) @@ -93,7 +44,11 @@ ModelManager.kt handles download and caching. 
./gradlew :sdk:connectedAndroidTest ``` -31 tests across 7 suites: SileroVadTest, ParakeetSttTest, KokoroTtsTest, KokoroMultilingualTest, PipelineE2ETest, BargeInTest, DeepFilterTest. +Suites: `SileroVadTest`, `ParakeetSttTest`, `KokoroTtsTest`, +`KokoroMultilingualTest`, `PipelineE2ETest`, `BargeInTest`, `DeepFilterTest`. + +Models (~1.2GB) download on first run via `ModelManager.ensureModels()`. +Subsequent runs use the device-side cache. #### Emulator setup (arm64, 4GB RAM required) @@ -104,29 +59,50 @@ echo "no" | avdmanager create avd -n speech_test -k "system-images;android-35-ex /opt/homebrew/share/android-commandlinetools/emulator/emulator -avd speech_test -no-window -no-audio -no-boot-anim -gpu swiftshader_indirect -memory 4096 ``` -Models (~1.2GB) download on first run. Subsequent runs use cache. +## Models + +ONNX models hosted on HuggingFace under [`aufklarer/`](https://huggingface.co/aufklarer) +org. INT8 quantized by default. -### Linux +- `aufklarer/Silero-VAD-v5-ONNX` — VAD +- `aufklarer/Parakeet-TDT-v3-ONNX` — STT (114 languages, 8192 BPE vocab) +- `aufklarer/Kokoro-82M-ONNX` — TTS + phonemizer dicts + voice embeddings +- `aufklarer/DeepFilterNet3-ONNX` — noise enhancer -```bash -linux/setup_linux.sh -linux/tests/download_models.sh -cd linux && cmake -B build -DORT_DIR=../ort-linux && cmake --build build -SPEECH_MODEL_DIR=tests/models ./build/speech_test -``` +`ModelManager.kt` handles download and caching. See speech-core's +[`docs/models.md`](https://github.com/soniqo/speech-core/blob/main/docs/models.md) +for the full model-file inventory. + +## Key files + +- `sdk/src/main/cpp/jni_bridge.cpp` — constructs `speech_core::SileroVad`/`ParakeetStt`/`KokoroTts` and feeds them to `speech_core::VoicePipeline`. No vtable adapters — the model wrappers implement the interfaces directly. 
+- `sdk/src/main/cpp/CMakeLists.txt` — pulls speech-core in via `add_subdirectory` with `SPEECH_CORE_WITH_ONNX=ON`; the speech_core_models target provides every model wrapper. +- `sdk/src/main/kotlin/com/soniqo/speech/SpeechPipeline.kt` — main public Kotlin API. +- `sdk/src/main/kotlin/com/soniqo/speech/NativeBridge.kt` — JNI surface (must stay in lockstep with `jni_bridge.cpp`). +- `sdk/src/main/kotlin/com/soniqo/speech/ModelManager.kt` — model download + caching. + +Native code that used to live here (`models/*.{cpp,h}`, `audio/{fft,mel,stft}.cpp`, +`util/json.h`, `onnx_engine.h`) is now under speech-core. Modify it via a +speech-core PR, then bump the submodule pointer here. + +## Workflow -11 tests: config, lifecycle, speech detection, concurrency, null safety. +- **Never push directly to main.** Create a feature branch, open a PR, merge after review. +- Branch naming: `feat/description`, `fix/description`, `chore/description`. +- PRs should include: summary, test plan, and link to related issues. +- Tag releases from main after merge: `git tag v0.0.X && git push origin v0.0.X`. +- CI runs on tags: builds SDK, runs unit tests, publishes to Maven Central + GitHub Packages, creates GitHub Release with APK. ## Guidelines -- Keep native code in C++17, no external deps beyond ONNX Runtime, OkHttp, and speech-core -- Kotlin SDK should be minimal — thin wrapper over JNI -- All model tensor names/shapes must match actual ONNX exports -- Test on arm64-v8a (Snapdragon) as primary target -- No Claude attribution in commits, PRs, or model cards -- **Never push directly to main — always use a PR** -- **Always ask for confirmation before creating a git commit** -- **Always ask for confirmation before any action visible to others** — pushing to any branch, opening / commenting on / reviewing / closing / merging PRs or issues, posting to Slack or any external service. 
The git commit rule above is one instance of this broader principle: never create externally visible artifacts without explicit confirmation. -- **Run unit tests (`./gradlew :sdk:test`) after making code changes** -- **Run e2e tests (`./gradlew :sdk:connectedAndroidTest`) before tagging a release** -- **README translations must stay in sync.** Any change to `README.md` must be mirrored in all translated copies: `README_zh.md`, `README_ja.md`, `README_ko.md`, `README_es.md`, `README_de.md`, `README_fr.md`, `README_hi.md`, `README_pt.md`, `README_ru.md` +- Keep native code in C++17. No external deps beyond ONNX Runtime, OkHttp, and speech-core. +- Kotlin SDK stays minimal — thin wrapper over JNI. +- All model tensor names/shapes must match the published ONNX exports under `aufklarer/`. +- Test on arm64-v8a (Snapdragon) as primary target. +- **No Claude attribution** in commits, PRs, or model cards. Strip both the `🤖 Generated with [Claude Code]` footer and the `Co-Authored-By: Claude …` trailer from defaults. +- **Never push directly to main — always use a PR**. +- **Always ask for confirmation before creating a git commit**. +- **Always ask for confirmation before any externally-visible action** — pushing to any branch, opening / commenting on / reviewing / closing / merging PRs or issues, posting to Slack or any external service. The git commit rule above is one instance of this broader principle. +- **Run unit tests (`./gradlew :sdk:test`) after making code changes**. +- **Run e2e tests (`./gradlew :sdk:connectedAndroidTest`) before tagging a release**. +- **README translations must stay in sync.** Any change to `README.md` must be mirrored in all translated copies: `README_zh.md`, `README_ja.md`, `README_ko.md`, `README_es.md`, `README_de.md`, `README_fr.md`, `README_hi.md`, `README_pt.md`, `README_ru.md`. 
diff --git a/README.md b/README.md index 91dba12..95ae7ea 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,15 @@ 📖 Read in: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -On-device speech SDK for Android and embedded Linux, powered by [ONNX Runtime](https://onnxruntime.ai) and [speech-core](https://github.com/soniqo/speech-core). +On-device speech SDK for Android, powered by [ONNX Runtime](https://onnxruntime.ai) and [speech-core](https://github.com/soniqo/speech-core). Speech recognition (114 languages), text-to-speech (8 languages), voice activity detection, and noise cancellation — all running locally. No cloud APIs, no data leaves the device. -**[Demo APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Models](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple counterpart) · **[speech-core](https://github.com/soniqo/speech-core)** (pipeline engine) +**[Demo APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Models](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple counterpart) · **[speech-core](https://github.com/soniqo/speech-core)** (pipeline engine + Linux/embedded build) -## Platforms +## Scope -| Platform | API | Acceleration | Directory | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` | -| Embedded Linux | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` | +This repo is the **Android packaging**: Kotlin SDK, JNI bridge, demo app. 
The C++ engine and ONNX model wrappers (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) live in [speech-core](https://github.com/soniqo/speech-core) and are pulled in via a git submodule. Linux / automotive (Yocto, Qualcomm SA8295P/SA8255P) lives at [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). ## Models @@ -24,15 +21,13 @@ Speech recognition (114 languages), text-to-speech (8 languages), voice activity | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Voice activity detection | 2 MB | Any | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Noise cancellation | ~8 MB | Any | -Models are downloaded automatically on first launch (Android) or placed manually (Linux). - -## Android +Models are downloaded automatically on first launch via `ModelManager.ensureModels()`. -### Try the demo +## Try the demo Download the [signed APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) and install on any arm64 Android device (8+). Models (~1.2 GB) download automatically on first launch. -### Add dependency +## Add dependency ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Kotlin usage +## Kotlin usage ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### Build from source +## Build from source ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 e2e tests ``` -### Demo app +`./setup.sh` initializes the speech-core submodule and downloads ONNX Runtime +into `./ort/`. 
+ +## Demo app The [`app/`](app/) module is a minimal voice assistant demo with: @@ -87,7 +85,7 @@ The [`app/`](app/) module is a minimal voice assistant demo with: ./gradlew :app:installDebug ``` -### System voice input (`RecognitionService`) +## System voice input (`RecognitionService`) The SDK ships a ready-made `audio.soniqo.speech.service.SpeechRecognitionService` that plugs into Android's framework `SpeechRecognizer` API — no code to write. @@ -159,53 +157,6 @@ Measured on Android emulator (arm64-v8a, no NNAPI). Real hardware is significant | Kokoro 82M | TTS | 1.9s output | 1,075ms | 0.58 | | Silero VAD v5 | VAD | 32ms chunk | <1ms | <0.01 | -## Embedded Linux - -Minimal C API for automotive and embedded platforms. See [`linux/README.md`](linux/README.md) for full documentation. - -### C API usage - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Hexagon DSP acceleration - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### Build - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### Test - -```bash -linux/tests/download_models.sh # download ONNX models -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 tests -``` - -### Cross-compile for Yocto - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=... 
-cmake --build build -``` - ## Pipeline ```text @@ -220,41 +171,51 @@ Barge-in supported: speaking during TTS playback interrupts and starts a new tra ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 lines) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (git submodule) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ speech_core (orchestration: │ │ +│ │ pipeline · turn · interruptions) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +Each model class directly implements the corresponding speech-core interface +(`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) — the +JNI bridge instantiates them and hands references to `VoicePipeline`. No +C-vtable adapter boilerplate. 
+ ## Hardware Acceleration -| Platform | Chipset | Acceleration | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| Automotive | SA8295P / SA8255P | QNN → Hexagon DSP | -| Any | CPU fallback | XNNPACK | +| Chipset | Acceleration | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| CPU fallback | XNNPACK | + +For automotive Qualcomm SA8295P / SA8255P with QNN (Hexagon DSP), see +[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). ## Related -| Repository | Platform | +| Repository | Scope | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | Cross-platform C++ pipeline engine | -| **speech-android** | Android + embedded Linux — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | Cross-platform C++ pipeline engine + ONNX model wrappers + Linux/embedded examples | +| **speech-android** | Android wrapper — Kotlin SDK + JNI bridge over speech-core | ## License diff --git a/README_de.md b/README_de.md index 8bdffd3..b2147fa 100644 --- a/README_de.md +++ b/README_de.md @@ -2,18 +2,15 @@ 📖 Sprachen: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -On-Device Speech-SDK für Android und Embedded Linux, basierend auf [ONNX Runtime](https://onnxruntime.ai) und [speech-core](https://github.com/soniqo/speech-core). +On-Device Speech-SDK für Android, basierend auf [ONNX Runtime](https://onnxruntime.ai) und [speech-core](https://github.com/soniqo/speech-core). 
Spracherkennung (114 Sprachen), Text-to-Speech (8 Sprachen), Sprachaktivitätserkennung und Rauschunterdrückung — alles lokal ausgeführt. Keine Cloud-APIs, keine Daten verlassen das Gerät. -**[Demo-APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelle](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple-Pendant) · **[speech-core](https://github.com/soniqo/speech-core)** (Pipeline-Engine) +**[Demo-APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelle](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple-Pendant) · **[speech-core](https://github.com/soniqo/speech-core)** (Pipeline-Engine + Linux/Embedded-Build) -## Plattformen +## Geltungsbereich -| Plattform | API | Beschleunigung | Verzeichnis | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` | -| Embedded Linux | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` | +Dieses Repo ist das **Android-Packaging**: Kotlin-SDK, JNI-Bridge, Demo-App. Die C++-Engine und die ONNX-Modell-Wrapper (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) liegen in [speech-core](https://github.com/soniqo/speech-core) und werden über ein Git-Submodul eingebunden. Linux / Automotive (Yocto, Qualcomm SA8295P/SA8255P) befindet sich unter [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). 
## Modelle @@ -24,15 +21,13 @@ Spracherkennung (114 Sprachen), Text-to-Speech (8 Sprachen), Sprachaktivitätser | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Sprachaktivitätserkennung | 2 MB | Beliebig | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Rauschunterdrückung | ~8 MB | Beliebig | -Modelle werden beim ersten Start automatisch heruntergeladen (Android) oder manuell platziert (Linux). - -## Android +Modelle werden beim ersten Start automatisch über `ModelManager.ensureModels()` heruntergeladen. -### Demo ausprobieren +## Demo ausprobieren Lade das [signierte APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) herunter und installiere es auf einem beliebigen arm64-Android-Gerät (8+). Modelle (~1,2 GB) werden beim ersten Start automatisch heruntergeladen. -### Abhängigkeit hinzufügen +## Abhängigkeit hinzufügen ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Kotlin-Verwendung +## Kotlin-Verwendung ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### Aus dem Quellcode bauen +## Aus dem Quellcode bauen ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 e2e-Tests ``` -### Demo-App +`./setup.sh` initialisiert das speech-core-Submodul und lädt die ONNX Runtime +nach `./ort/` herunter. 
+ +## Demo-App Das Modul [`app/`](app/) ist eine minimale Sprachassistenten-Demo mit: @@ -87,7 +85,7 @@ Das Modul [`app/`](app/) ist eine minimale Sprachassistenten-Demo mit: ./gradlew :app:installDebug ``` -### Systemweite Spracheingabe (`RecognitionService`) +## Systemweite Spracheingabe (`RecognitionService`) Das SDK enthält einen einsatzbereiten `audio.soniqo.speech.service.SpeechRecognitionService`, der sich in die `SpeechRecognizer`-API des Android-Frameworks einklinkt — kein Code zu schreiben. Sobald deine App als Standard-Spracherkennung ausgewählt ist, erhält jede Drittanbieter-App, die `SpeechRecognizer.createSpeechRecognizer(context)` (ohne `ComponentName`) aufruft, vollständiges On-Device-STT über deine Pipeline. @@ -143,53 +141,6 @@ Gemessen auf einem Android-Emulator (arm64-v8a, ohne NNAPI). Echte Hardware ist | Kokoro 82M | TTS | 1,9s Ausgabe | 1.075ms | 0,58 | | Silero VAD v5 | VAD | 32ms-Block | <1ms | <0,01 | -## Embedded Linux - -Minimale C-API für Automotive- und Embedded-Plattformen. Vollständige Dokumentation siehe [`linux/README.md`](linux/README.md). 
- -### C-API-Verwendung - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Hexagon-DSP-Beschleunigung - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### Bauen - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### Testen - -```bash -linux/tests/download_models.sh # ONNX-Modelle herunterladen -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 Tests -``` - -### Cross-Compile für Yocto - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=... -cmake --build build -``` - ## Pipeline ```text @@ -204,41 +155,51 @@ Barge-In wird unterstützt: Sprechen während der TTS-Wiedergabe unterbricht und ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 Zeilen) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (Git-Submodul) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ 
speech_core (Orchestrierung: │ │ +│ │ Pipeline · Turn · Interruptions) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +Jede Modellklasse implementiert direkt die entsprechende speech-core-Schnittstelle +(`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) — die +JNI-Bridge instanziiert sie und übergibt Referenzen an `VoicePipeline`. Kein +C-vtable-Adapter-Boilerplate. + ## Hardwarebeschleunigung -| Plattform | Chipsatz | Beschleunigung | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| Automotive | SA8295P / SA8255P | QNN → Hexagon DSP | -| Beliebig | CPU-Fallback | XNNPACK | +| Chipsatz | Beschleunigung | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| CPU-Fallback | XNNPACK | + +Für Automotive Qualcomm SA8295P / SA8255P mit QNN (Hexagon DSP) siehe +[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). 
## Verwandte Projekte -| Repository | Plattform | +| Repository | Geltungsbereich | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | Plattformübergreifende C++-Pipeline-Engine | -| **speech-android** | Android + Embedded Linux — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | Plattformübergreifende C++-Pipeline-Engine + ONNX-Modell-Wrapper + Linux/Embedded-Beispiele | +| **speech-android** | Android-Wrapper — Kotlin-SDK + JNI-Bridge über speech-core | ## Lizenz diff --git a/README_es.md b/README_es.md index 575055c..c5ae3d3 100644 --- a/README_es.md +++ b/README_es.md @@ -2,18 +2,15 @@ 📖 Idiomas: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -SDK de voz en el dispositivo para Android y Linux embebido, impulsado por [ONNX Runtime](https://onnxruntime.ai) y [speech-core](https://github.com/soniqo/speech-core). +SDK de voz en el dispositivo para Android, impulsado por [ONNX Runtime](https://onnxruntime.ai) y [speech-core](https://github.com/soniqo/speech-core). Reconocimiento de voz (114 idiomas), texto a voz (8 idiomas), detección de actividad de voz y cancelación de ruido — todo ejecutándose localmente. Sin APIs en la nube, ningún dato sale del dispositivo. 
-**[APK de demostración](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelos](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (contraparte Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (motor de pipeline) +**[APK de demostración](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelos](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (contraparte Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (motor de pipeline + compilación Linux/embebido) -## Plataformas +## Alcance -| Plataforma | API | Aceleración | Directorio | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` | -| Linux embebido | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` | +Este repositorio es el **empaquetado para Android**: SDK de Kotlin, puente JNI, app demo. El motor C++ y los envoltorios de modelos ONNX (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) viven en [speech-core](https://github.com/soniqo/speech-core) y se incorporan vía un submódulo git. Linux / automoción (Yocto, Qualcomm SA8295P/SA8255P) vive en [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). ## Modelos @@ -24,15 +21,13 @@ Reconocimiento de voz (114 idiomas), texto a voz (8 idiomas), detección de acti | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Detección de actividad de voz | 2 MB | Cualquiera | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Cancelación de ruido | ~8 MB | Cualquiera | -Los modelos se descargan automáticamente al primer inicio (Android) o se colocan manualmente (Linux). 
- -## Android +Los modelos se descargan automáticamente al primer inicio vía `ModelManager.ensureModels()`. -### Prueba la demo +## Prueba la demo Descarga el [APK firmado](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) e instálalo en cualquier dispositivo Android arm64 (8+). Los modelos (~1.2 GB) se descargan automáticamente en el primer inicio. -### Añadir dependencia +## Añadir dependencia ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Uso de Kotlin +## Uso de Kotlin ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### Compilar desde fuente +## Compilar desde fuente ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 pruebas e2e ``` -### Aplicación demo +`./setup.sh` inicializa el submódulo speech-core y descarga ONNX Runtime +en `./ort/`. + +## Aplicación demo El módulo [`app/`](app/) es una demo mínima de asistente de voz con: @@ -87,9 +85,14 @@ El módulo [`app/`](app/) es una demo mínima de asistente de voz con: ./gradlew :app:installDebug ``` -### Entrada de voz del sistema (`RecognitionService`) +## Entrada de voz del sistema (`RecognitionService`) -El SDK incluye un `audio.soniqo.speech.service.SpeechRecognitionService` listo para usar que se conecta a la API `SpeechRecognizer` del framework de Android — sin código que escribir. Una vez que tu app está seleccionada como reconocedor de voz predeterminado, cualquier app de terceros que llame a `SpeechRecognizer.createSpeechRecognizer(context)` (sin `ComponentName`) obtiene STT completamente en el dispositivo a través de tu pipeline. +El SDK incluye un `audio.soniqo.speech.service.SpeechRecognitionService` listo +para usar que se conecta a la API `SpeechRecognizer` del framework de Android +— sin código que escribir. 
Una vez que tu app está seleccionada como +reconocedor de voz predeterminado, cualquier app de terceros que llame a +`SpeechRecognizer.createSpeechRecognizer(context)` (sin `ComponentName`) +obtiene STT completamente en el dispositivo a través de tu pipeline. **1. Declara `RECORD_AUDIO` y el servicio en `AndroidManifest.xml`:** @@ -118,20 +121,33 @@ El SDK incluye un `audio.soniqo.speech.service.SpeechRecognitionService` listo p ``` -(Opcionalmente añade `android:settingsActivity="..."` para exponer un icono de engranaje en el selector de entrada de voz del sistema.) +(Opcionalmente añade `android:settingsActivity="..."` para exponer un icono +de engranaje en el selector de entrada de voz del sistema.) -**3. Configura el servicio como predeterminado del sistema** (Ajustes → Sistema → Idiomas e introducción → Selector de entrada de voz en Android puro, o vía adb): +**3. Configura el servicio como predeterminado del sistema** (Ajustes → +Sistema → Idiomas e introducción → Selector de entrada de voz en Android +puro, o vía adb): ```bash adb shell settings put secure voice_recognition_service \ your.package/audio.soniqo.speech.service.SpeechRecognitionService ``` -**4. Verifica** ejecutando la pantalla *Recognizer test* de la app demo, que llama a `SpeechRecognizer.createSpeechRecognizer(ctx)` (sin componente) y registra cada callback del framework — útil para confirmar el round-trip del binder sin necesitar logcat. +**4. Verifica** ejecutando la pantalla *Recognizer test* de la app demo, que +llama a `SpeechRecognizer.createSpeechRecognizer(ctx)` (sin componente) y +registra cada callback del framework — útil para confirmar el round-trip del +binder sin necesitar logcat. -El servicio implementa `onCheckRecognitionSupport` (API 33+) devolviendo los 27 idiomas BCP-47 que cubre Parakeet TDT v3, marcados como `installedOnDeviceLanguage` cuando los modelos están presentes (o `pendingOnDeviceLanguage` mientras se descargan). 
Se adquiere foco de audio con `AUDIOFOCUS_GAIN_TRANSIENT` durante la sesión. +El servicio implementa `onCheckRecognitionSupport` (API 33+) devolviendo los +27 idiomas BCP-47 que cubre Parakeet TDT v3, marcados como +`installedOnDeviceLanguage` cuando los modelos están presentes (o +`pendingOnDeviceLanguage` mientras se descargan). Se adquiere foco de audio +con `AUDIOFOCUS_GAIN_TRANSIENT` durante la sesión. -**Limitación:** Gboard, Samsung Keyboard y Google Assistant incluyen sus propios reconocedores y se saltan el predeterminado del sistema. Las apps que llaman explícitamente a la API `SpeechRecognizer` del framework (o construyen su propia UI sobre ella) son las que pasan por tu servicio. +**Limitación:** Gboard, Samsung Keyboard y Google Assistant incluyen sus +propios reconocedores y se saltan el predeterminado del sistema. Las apps +que llaman explícitamente a la API `SpeechRecognizer` del framework (o +construyen su propia UI sobre ella) son las que pasan por tu servicio. ## Rendimiento @@ -143,53 +159,6 @@ Medido en emulador Android (arm64-v8a, sin NNAPI). El hardware real es significa | Kokoro 82M | TTS | 1.9s salida | 1,075ms | 0.58 | | Silero VAD v5 | VAD | bloque 32ms | <1ms | <0.01 | -## Linux embebido - -API C mínima para plataformas automotrices y embebidas. Consulta [`linux/README.md`](linux/README.md) para la documentación completa. 
- -### Uso de la API C - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Aceleración Hexagon DSP - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### Compilar - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### Probar - -```bash -linux/tests/download_models.sh # descargar modelos ONNX -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 pruebas -``` - -### Compilación cruzada para Yocto - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=... -cmake --build build -``` - ## Pipeline ```text @@ -204,41 +173,51 @@ Soporte de barge-in: hablar durante la reproducción TTS interrumpe e inicia una ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 líneas) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (submódulo git) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │
│ +│ │ speech_core (orquestación: │ │ +│ │ pipeline · turno · interrupciones) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +Cada clase de modelo implementa directamente la interfaz correspondiente de +speech-core (`VADInterface`, `STTInterface`, `TTSInterface`, +`EnhancerInterface`) — el puente JNI las instancia y entrega las referencias +a `VoicePipeline`. Sin código repetitivo de adaptadores con vtables en C. + ## Aceleración por hardware -| Plataforma | Chipset | Aceleración | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| Automoción | SA8295P / SA8255P | QNN → Hexagon DSP | -| Cualquiera | Fallback CPU | XNNPACK | +| Chipset | Aceleración | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| Fallback CPU | XNNPACK | + +Para Qualcomm SA8295P / SA8255P de automoción con QNN (Hexagon DSP), consulta +[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). 
## Relacionados -| Repositorio | Plataforma | +| Repositorio | Alcance | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | Motor de pipeline C++ multiplataforma | -| **speech-android** | Android + Linux embebido — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | Motor de pipeline C++ multiplataforma + envoltorios de modelos ONNX + ejemplos Linux/embebido | +| **speech-android** | Envoltorio Android — SDK Kotlin + puente JNI sobre speech-core | ## Licencia diff --git a/README_fr.md b/README_fr.md index 9752333..a46ab04 100644 --- a/README_fr.md +++ b/README_fr.md @@ -2,18 +2,15 @@ 📖 Langues : [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -SDK vocal sur appareil pour Android et Linux embarqué, propulsé par [ONNX Runtime](https://onnxruntime.ai) et [speech-core](https://github.com/soniqo/speech-core). +SDK vocal sur appareil pour Android, propulsé par [ONNX Runtime](https://onnxruntime.ai) et [speech-core](https://github.com/soniqo/speech-core). Reconnaissance vocale (114 langues), synthèse vocale (8 langues), détection d'activité vocale et suppression de bruit — tout fonctionne en local. Aucune API cloud, aucune donnée ne quitte l'appareil. 
-**[APK de démo](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modèles](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (équivalent Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (moteur de pipeline) +**[APK de démo](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modèles](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (équivalent Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (moteur de pipeline + build Linux/embarqué) -## Plateformes +## Périmètre -| Plateforme | API | Accélération | Répertoire | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` | -| Linux embarqué | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` | +Ce dépôt fournit le **packaging Android** : SDK Kotlin, pont JNI, application de démo. Le moteur C++ et les wrappers de modèles ONNX (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) résident dans [speech-core](https://github.com/soniqo/speech-core) et sont intégrés via un sous-module git. Le volet Linux / automobile (Yocto, Qualcomm SA8295P/SA8255P) se trouve dans [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). ## Modèles @@ -24,15 +21,13 @@ Reconnaissance vocale (114 langues), synthèse vocale (8 langues), détection d' | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Détection d'activité vocale | 2 Mo | Toutes | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Suppression de bruit | ~8 Mo | Toutes | -Les modèles sont téléchargés automatiquement au premier lancement (Android) ou placés manuellement (Linux). 
- -## Android +Les modèles sont téléchargés automatiquement au premier lancement via `ModelManager.ensureModels()`. -### Essayer la démo +## Essayer la démo Téléchargez l'[APK signé](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) et installez-le sur n'importe quel appareil Android arm64 (8+). Les modèles (~1,2 Go) sont téléchargés automatiquement au premier lancement. -### Ajouter la dépendance +## Ajouter la dépendance ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Utilisation Kotlin +## Utilisation Kotlin ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### Compiler depuis les sources +## Compiler depuis les sources ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 tests e2e ``` -### Application de démo +`./setup.sh` initialise le sous-module speech-core et télécharge ONNX Runtime +dans `./ort/`. + +## Application de démo Le module [`app/`](app/) est une démo minimale d'assistant vocal avec : @@ -87,7 +85,7 @@ Le module [`app/`](app/) est une démo minimale d'assistant vocal avec : ./gradlew :app:installDebug ``` -### Entrée vocale système (`RecognitionService`) +## Entrée vocale système (`RecognitionService`) Le SDK fournit un `audio.soniqo.speech.service.SpeechRecognitionService` prêt à l'emploi qui s'intègre à l'API `SpeechRecognizer` du framework Android — aucun code à écrire. Une fois votre app sélectionnée comme reconnaisseur vocal par défaut, toute application tierce appelant `SpeechRecognizer.createSpeechRecognizer(context)` (sans `ComponentName`) obtient un STT entièrement on-device via votre pipeline. @@ -143,53 +141,6 @@ Mesuré sur émulateur Android (arm64-v8a, sans NNAPI). 
Le matériel réel est n | Kokoro 82M | TTS | 1,9 s en sortie | 1 075 ms | 0,58 | | Silero VAD v5 | VAD | bloc 32 ms | <1 ms | <0,01 | -## Linux embarqué - -API C minimale pour les plateformes automobiles et embarquées. Voir [`linux/README.md`](linux/README.md) pour la documentation complète. - -### Utilisation de l'API C - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Accélération Hexagon DSP - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### Compiler - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### Tester - -```bash -linux/tests/download_models.sh # télécharger les modèles ONNX -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 tests -``` - -### Compilation croisée pour Yocto - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build -``` - ## Pipeline ```text @@ -204,41 +155,47 @@ Le barge-in est pris en charge : parler pendant la lecture TTS l'interrompt et d ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 lignes) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (sous-module) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ speech_core (orchestration : │ │ +│ │ pipeline · tour · interruptions) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +Chaque classe de modèle implémente directement l'interface speech-core correspondante (`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) — le pont JNI les instancie et transmet les références à `VoicePipeline`. Aucun boilerplate d'adaptateur de vtable C. 
+ ## Accélération matérielle -| Plateforme | Chipset | Accélération | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| Automobile | SA8295P / SA8255P | QNN → Hexagon DSP | -| Toutes | Repli CPU | XNNPACK | +| Chipset | Accélération | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| Repli CPU | XNNPACK | + +Pour les plateformes automobiles Qualcomm SA8295P / SA8255P avec QNN (Hexagon DSP), voir [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). ## Projets liés -| Dépôt | Plateforme | +| Dépôt | Périmètre | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | Moteur de pipeline C++ multiplateforme | -| **speech-android** | Android + Linux embarqué — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | Moteur de pipeline C++ multiplateforme + wrappers de modèles ONNX + exemples Linux/embarqué | +| **speech-android** | Wrapper Android — SDK Kotlin + pont JNI sur speech-core | ## Licence diff --git a/README_hi.md b/README_hi.md index dde08c6..a1b91e3 100644 --- a/README_hi.md +++ b/README_hi.md @@ -2,18 +2,15 @@ 📖 भाषाएँ: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -Android और एम्बेडेड Linux के लिए ऑन-डिवाइस स्पीच SDK, [ONNX Runtime](https://onnxruntime.ai) और [speech-core](https://github.com/soniqo/speech-core) द्वारा संचालित। +Android के लिए ऑन-डिवाइस स्पीच SDK, [ONNX Runtime](https://onnxruntime.ai) और 
[speech-core](https://github.com/soniqo/speech-core) द्वारा संचालित। स्पीच रिकग्निशन (114 भाषाएँ), टेक्स्ट-टू-स्पीच (8 भाषाएँ), वॉयस एक्टिविटी डिटेक्शन, और शोर रद्दीकरण — सभी स्थानीय रूप से चलते हैं। कोई क्लाउड API नहीं, कोई डेटा डिवाइस से बाहर नहीं जाता। -**[डेमो APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[मॉडल](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple समकक्ष) · **[speech-core](https://github.com/soniqo/speech-core)** (पाइपलाइन इंजन) +**[डेमो APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[मॉडल](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple समकक्ष) · **[speech-core](https://github.com/soniqo/speech-core)** (पाइपलाइन इंजन + Linux/एम्बेडेड बिल्ड) -## प्लेटफ़ॉर्म +## स्कोप -| प्लेटफ़ॉर्म | API | त्वरण | निर्देशिका | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` | -| एम्बेडेड Linux | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` | +यह रिपॉज़िटरी **Android पैकेजिंग** है: Kotlin SDK, JNI ब्रिज, डेमो ऐप। C++ इंजन और ONNX मॉडल रैपर (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) [speech-core](https://github.com/soniqo/speech-core) में रहते हैं और एक git सबमॉड्यूल के माध्यम से शामिल किए जाते हैं। Linux / ऑटोमोटिव (Yocto, Qualcomm SA8295P/SA8255P) [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux) पर रहता है। ## मॉडल @@ -24,15 +21,13 @@ Android और एम्बेडेड Linux के लिए ऑन-डिव | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | वॉयस एक्टिविटी डिटेक्शन | 2 MB | कोई भी | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | शोर रद्दीकरण | ~8 MB | कोई भी | -मॉडल पहले लॉन्च पर स्वचालित रूप से 
डाउनलोड होते हैं (Android) या मैन्युअल रूप से रखे जाते हैं (Linux)। - -## Android +मॉडल पहले लॉन्च पर `ModelManager.ensureModels()` के माध्यम से स्वचालित रूप से डाउनलोड होते हैं। -### डेमो आज़माएँ +## डेमो आज़माएँ [हस्ताक्षरित APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) डाउनलोड करें और किसी भी arm64 Android डिवाइस (8+) पर इंस्टॉल करें। मॉडल (~1.2 GB) पहले लॉन्च पर स्वचालित रूप से डाउनलोड होते हैं। -### निर्भरता जोड़ें +## निर्भरता जोड़ें ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Kotlin उपयोग +## Kotlin उपयोग ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### स्रोत से बिल्ड करें +## स्रोत से बिल्ड करें ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 e2e परीक्षण ``` -### डेमो ऐप +`./setup.sh` speech-core सबमॉड्यूल को इनिशियलाइज़ करता है और ONNX Runtime को +`./ort/` में डाउनलोड करता है। + +## डेमो ऐप [`app/`](app/) मॉड्यूल एक न्यूनतम वॉयस असिस्टेंट डेमो है जिसमें शामिल हैं: @@ -87,7 +85,7 @@ cd speech-android ./gradlew :app:installDebug ``` -### सिस्टम वॉयस इनपुट (`RecognitionService`) +## सिस्टम वॉयस इनपुट (`RecognitionService`) SDK एक उपयोग के लिए तैयार `audio.soniqo.speech.service.SpeechRecognitionService` शामिल करता है जो Android फ्रेमवर्क के `SpeechRecognizer` API से जुड़ता है — कोई कोड लिखने की आवश्यकता नहीं। एक बार आपका ऐप डिफ़ॉल्ट वॉयस रिकग्नाइज़र के रूप में चुना जाता है, कोई भी थर्ड-पार्टी ऐप जो `SpeechRecognizer.createSpeechRecognizer(context)` (बिना `ComponentName` के) कॉल करता है, आपकी पाइपलाइन के माध्यम से पूरी तरह से ऑन-डिवाइस STT प्राप्त करता है। @@ -143,53 +141,6 @@ Android एमुलेटर (arm64-v8a, NNAPI के बिना) पर म | Kokoro 82M | TTS | 1.9 सेकंड आउटपुट | 1,075 मिलीसेकंड | 0.58 | | Silero VAD v5 | VAD | 32 मिलीसेकंड चंक | <1 मिलीसेकंड | <0.01 | -## एम्बेडेड Linux - -ऑटोमोटिव और एम्बेडेड प्लेटफ़ॉर्म के 
लिए न्यूनतम C API। पूर्ण दस्तावेज़ के लिए [`linux/README.md`](linux/README.md) देखें। - -### C API उपयोग - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Hexagon DSP त्वरण - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### बिल्ड - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### परीक्षण - -```bash -linux/tests/download_models.sh # ONNX मॉडल डाउनलोड करें -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 परीक्षण -``` - -### Yocto के लिए क्रॉस-कंपाइल - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build -``` - ## पाइपलाइन ```text @@ -204,41 +155,47 @@ Idle → Listening → Transcribing → Speaking → Idle ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 lines) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (git submodule) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ speech_core (orchestration: │ │ +│ │ pipeline · turn · interruptions) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +प्रत्येक मॉडल क्लास सीधे संबंधित speech-core इंटरफ़ेस (`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) को लागू करता है — JNI ब्रिज उन्हें इंस्टैंशिएट करता है और संदर्भ `VoicePipeline` को सौंपता है। कोई C-vtable अडैप्टर बॉइलरप्लेट नहीं। + ## हार्डवेयर त्वरण -| प्लेटफ़ॉर्म | चिपसेट | त्वरण | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| ऑटोमोटिव | SA8295P / SA8255P | QNN → Hexagon DSP | -| कोई भी | CPU फ़ॉलबैक | XNNPACK | +| चिपसेट | त्वरण | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI 
→ Google TPU | +| CPU फ़ॉलबैक | XNNPACK | + +ऑटोमोटिव Qualcomm SA8295P / SA8255P के लिए QNN (Hexagon DSP) के साथ, [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux) देखें। ## संबंधित परियोजनाएँ -| रिपॉज़िटरी | प्लेटफ़ॉर्म | +| रिपॉज़िटरी | स्कोप | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | क्रॉस-प्लेटफ़ॉर्म C++ पाइपलाइन इंजन | -| **speech-android** | Android + एम्बेडेड Linux — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | क्रॉस-प्लेटफ़ॉर्म C++ पाइपलाइन इंजन + ONNX मॉडल रैपर + Linux/एम्बेडेड उदाहरण | +| **speech-android** | Android रैपर — speech-core के ऊपर Kotlin SDK + JNI ब्रिज | ## लाइसेंस diff --git a/README_ja.md b/README_ja.md index 76da318..abd78ea 100644 --- a/README_ja.md +++ b/README_ja.md @@ -2,18 +2,15 @@ 📖 言語: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -[ONNX Runtime](https://onnxruntime.ai) と [speech-core](https://github.com/soniqo/speech-core) を活用した、Android および組み込み Linux 向けのオンデバイス音声 SDK。 +[ONNX Runtime](https://onnxruntime.ai) と [speech-core](https://github.com/soniqo/speech-core) を活用した、Android 向けオンデバイス音声 SDK。 音声認識(114 言語)、テキスト読み上げ(8 言語)、音声活動検出、ノイズキャンセリング — すべてローカルで動作。クラウド API 不要、データはデバイスから外に出ません。 -**[デモ APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[モデル](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 版)· **[speech-core](https://github.com/soniqo/speech-core)**(パイプラインエンジン) +**[デモ APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · 
**[モデル](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 版)· **[speech-core](https://github.com/soniqo/speech-core)**(パイプラインエンジン + Linux/組み込みビルド) -## プラットフォーム +## スコープ -| プラットフォーム | API | アクセラレーション | ディレクトリ | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI(Snapdragon、Exynos、Tensor) | `sdk/` | -| 組み込み Linux | C (`speech.h`) | QNN(Hexagon DSP) | `linux/` | +このリポジトリは **Android パッケージング** を担当します:Kotlin SDK、JNI ブリッジ、デモアプリ。C++ エンジンおよび ONNX モデルラッパー(Silero VAD、Parakeet STT、Kokoro TTS、DeepFilterNet3)は [speech-core](https://github.com/soniqo/speech-core) に存在し、git サブモジュールとして取り込まれます。Linux / 自動車向け(Yocto、Qualcomm SA8295P/SA8255P)は [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux) に存在します。 ## モデル @@ -24,15 +21,13 @@ | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | 音声活動検出 | 2 MB | 任意 | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | ノイズキャンセリング | ~8 MB | 任意 | -モデルは初回起動時に自動ダウンロード(Android)または手動配置(Linux)されます。 - -## Android +モデルは初回起動時に `ModelManager.ensureModels()` 経由で自動ダウンロードされます。 -### デモを試す +## デモを試す [署名済み APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) をダウンロードし、任意の arm64 Android デバイス(8 以降)にインストールします。モデル(~1.2 GB)は初回起動時に自動ダウンロードされます。 -### 依存関係を追加 +## 依存関係を追加 ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Kotlin の使い方 +## Kotlin の使い方 ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### ソースからビルド +## ソースからビルド ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 個の e2e テスト ``` -### デモアプリ +`./setup.sh` は speech-core サブモジュールを初期化し、ONNX Runtime を +`./ort/` にダウンロードします。 + +## デモアプリ [`app/`](app/) モジュールは最小限の音声アシスタントデモで、以下を含みます: @@ 
-87,7 +85,7 @@ cd speech-android ./gradlew :app:installDebug ``` -### システム音声入力(`RecognitionService`) +## システム音声入力(`RecognitionService`) SDK には、Android フレームワークの `SpeechRecognizer` API に組み込めるすぐに使える `audio.soniqo.speech.service.SpeechRecognitionService` が含まれています — コードを書く必要はありません。アプリがデフォルトの音声認識サービスに選択されると、`SpeechRecognizer.createSpeechRecognizer(context)`(`ComponentName` なし)を呼び出す任意のサードパーティアプリが、あなたのパイプラインを通じて完全なオンデバイス STT を利用できます。 @@ -143,53 +141,6 @@ Android エミュレータ(arm64-v8a、NNAPI なし)で測定。実機ははる | Kokoro 82M | TTS | 1.9 秒出力 | 1,075 ミリ秒 | 0.58 | | Silero VAD v5 | VAD | 32 ミリ秒チャンク | <1 ミリ秒 | <0.01 | -## 組み込み Linux - -自動車および組み込みプラットフォーム向けの最小限の C API。詳細は [`linux/README.md`](linux/README.md) を参照してください。 - -### C API の使い方 - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Hexagon DSP アクセラレーション - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### ビルド - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### テスト - -```bash -linux/tests/download_models.sh # ONNX モデルをダウンロード -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 個のテスト -``` - -### Yocto 向けクロスコンパイル - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=... 
-cmake --build build -``` - ## パイプライン ```text @@ -204,41 +155,51 @@ Idle → Listening → Transcribing → Speaking → Idle ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 行) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (git サブモジュール) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ speech_core (オーケストレーション: │ │ +│ │ パイプライン · ターン · 割り込み) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +各モデルクラスは対応する speech-core インターフェース +(`VADInterface`、`STTInterface`、`TTSInterface`、`EnhancerInterface`)を +直接実装します — JNI ブリッジがそれらをインスタンス化し、参照を +`VoicePipeline` に渡します。C vtable アダプタの定型コードは不要です。 + ## ハードウェアアクセラレーション -| プラットフォーム | チップセット | アクセラレーション | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| 自動車 | SA8295P / SA8255P | QNN → Hexagon DSP | -| 任意 | CPU フォールバック | XNNPACK | +| チップセット | アクセラレーション | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| CPU フォールバック | XNNPACK | + +自動車向け Qualcomm SA8295P / SA8255P と QNN(Hexagon 
DSP)については、 +[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux) を参照してください。 ## 関連プロジェクト -| リポジトリ | プラットフォーム | +| リポジトリ | スコープ | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple(macOS、iOS)— MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | クロスプラットフォーム C++ パイプラインエンジン | -| **speech-android** | Android + 組み込み Linux — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | クロスプラットフォーム C++ パイプラインエンジン + ONNX モデルラッパー + Linux/組み込み例 | +| **speech-android** | Android ラッパー — speech-core 上の Kotlin SDK + JNI ブリッジ | ## ライセンス diff --git a/README_ko.md b/README_ko.md index 941a796..8c7eec4 100644 --- a/README_ko.md +++ b/README_ko.md @@ -2,18 +2,15 @@ 📖 언어: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -[ONNX Runtime](https://onnxruntime.ai)와 [speech-core](https://github.com/soniqo/speech-core) 기반의 Android 및 임베디드 Linux용 온디바이스 음성 SDK. +[ONNX Runtime](https://onnxruntime.ai)와 [speech-core](https://github.com/soniqo/speech-core) 기반의 Android용 온디바이스 음성 SDK. 음성 인식(114개 언어), 텍스트 음성 변환(8개 언어), 음성 활동 감지, 노이즈 캔슬링 — 모두 로컬에서 실행됩니다. 클라우드 API도, 디바이스 외부로 전송되는 데이터도 없습니다. 
-**[데모 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[모델](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 버전) · **[speech-core](https://github.com/soniqo/speech-core)**(파이프라인 엔진) +**[데모 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[모델](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 버전) · **[speech-core](https://github.com/soniqo/speech-core)**(파이프라인 엔진 + Linux/임베디드 빌드) -## 플랫폼 +## 범위 -| 플랫폼 | API | 가속 | 디렉토리 | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI(Snapdragon, Exynos, Tensor) | `sdk/` | -| 임베디드 Linux | C (`speech.h`) | QNN(Hexagon DSP) | `linux/` | +이 저장소는 **Android 패키징**입니다: Kotlin SDK, JNI 브리지, 데모 앱. C++ 엔진과 ONNX 모델 래퍼(Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3)는 [speech-core](https://github.com/soniqo/speech-core)에 있으며 git 서브모듈을 통해 가져옵니다. Linux / 자동차(Yocto, Qualcomm SA8295P/SA8255P)는 [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux)에 있습니다. ## 모델 @@ -24,15 +21,13 @@ | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | 음성 활동 감지 | 2 MB | 모든 언어 | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | 노이즈 캔슬링 | ~8 MB | 모든 언어 | -모델은 첫 실행 시 자동 다운로드(Android)되거나 수동으로 배치(Linux)됩니다. - -## Android +모델은 `ModelManager.ensureModels()`를 통해 첫 실행 시 자동으로 다운로드됩니다. -### 데모 사용해보기 +## 데모 사용해보기 [서명된 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)를 다운로드하여 arm64 Android 기기(8 이상)에 설치하세요. 모델(~1.2 GB)은 첫 실행 시 자동으로 다운로드됩니다. 
-### 의존성 추가 +## 의존성 추가 ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Kotlin 사용법 +## Kotlin 사용법 ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### 소스에서 빌드 +## 소스에서 빌드 ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34개 e2e 테스트 ``` -### 데모 앱 +`./setup.sh`는 speech-core 서브모듈을 초기화하고 ONNX Runtime을 +`./ort/`로 다운로드합니다. + +## 데모 앱 [`app/`](app/) 모듈은 최소한의 음성 비서 데모로 다음을 포함합니다: @@ -87,7 +85,7 @@ cd speech-android ./gradlew :app:installDebug ``` -### 시스템 음성 입력(`RecognitionService`) +## 시스템 음성 입력(`RecognitionService`) SDK는 Android 프레임워크 `SpeechRecognizer` API에 연결되는 바로 사용 가능한 `audio.soniqo.speech.service.SpeechRecognitionService`를 제공합니다 — 작성할 코드가 없습니다. 앱이 기본 음성 인식기로 선택되면, `SpeechRecognizer.createSpeechRecognizer(context)`(`ComponentName` 없이)를 호출하는 모든 타사 앱이 파이프라인을 통해 완전히 온디바이스 STT를 받을 수 있습니다. @@ -143,53 +141,6 @@ Android 에뮬레이터(arm64-v8a, NNAPI 없음)에서 측정. 실제 하드웨 | Kokoro 82M | TTS | 1.9초 출력 | 1,075ms | 0.58 | | Silero VAD v5 | VAD | 32ms 청크 | <1ms | <0.01 | -## 임베디드 Linux - -자동차 및 임베디드 플랫폼을 위한 최소한의 C API. 전체 문서는 [`linux/README.md`](linux/README.md)를 참조하세요. 
- -### C API 사용법 - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Hexagon DSP 가속 - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### 빌드 - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### 테스트 - -```bash -linux/tests/download_models.sh # ONNX 모델 다운로드 -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12개 테스트 -``` - -### Yocto용 크로스 컴파일 - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=... -cmake --build build -``` - ## 파이프라인 ```text @@ -204,41 +155,51 @@ Idle → Listening → Transcribing → Speaking → Idle ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 lines) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (git submodule) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ speech_core (orchestration: │ │ +│ │ pipeline · turn · interruptions) │ │ +│ 
└──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +각 모델 클래스는 해당하는 speech-core 인터페이스(`VADInterface`, +`STTInterface`, `TTSInterface`, `EnhancerInterface`)를 직접 구현합니다 — +JNI 브리지가 이들을 인스턴스화하여 `VoicePipeline`에 참조를 전달합니다. +C-vtable 어댑터 보일러플레이트가 없습니다. + ## 하드웨어 가속 -| 플랫폼 | 칩셋 | 가속 | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| 자동차 | SA8295P / SA8255P | QNN → Hexagon DSP | -| 모두 | CPU 폴백 | XNNPACK | +| 칩셋 | 가속 | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| CPU 폴백 | XNNPACK | + +자동차용 Qualcomm SA8295P / SA8255P와 QNN(Hexagon DSP)은 +[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux)를 참조하세요. ## 관련 프로젝트 -| 저장소 | 플랫폼 | +| 저장소 | 범위 | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple(macOS, iOS) — MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | 크로스 플랫폼 C++ 파이프라인 엔진 | -| **speech-android** | Android + 임베디드 Linux — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | 크로스 플랫폼 C++ 파이프라인 엔진 + ONNX 모델 래퍼 + Linux/임베디드 예제 | +| **speech-android** | Android 래퍼 — speech-core 위에 Kotlin SDK + JNI 브리지 | ## 라이선스 diff --git a/README_pt.md b/README_pt.md index c149b52..17cace7 100644 --- a/README_pt.md +++ b/README_pt.md @@ -2,18 +2,15 @@ 📖 Idiomas: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -SDK de voz no dispositivo para Android e Linux embarcado, baseado em [ONNX Runtime](https://onnxruntime.ai) e 
[speech-core](https://github.com/soniqo/speech-core). +SDK de voz no dispositivo para Android, baseado em [ONNX Runtime](https://onnxruntime.ai) e [speech-core](https://github.com/soniqo/speech-core). Reconhecimento de fala (114 idiomas), texto para fala (8 idiomas), detecção de atividade vocal e cancelamento de ruído — tudo executado localmente. Sem APIs em nuvem, nenhum dado sai do dispositivo. -**[APK de demonstração](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelos](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (contraparte Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (motor de pipeline) +**[APK de demonstração](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelos](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (contraparte Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (motor de pipeline + build Linux/embarcado) -## Plataformas +## Escopo -| Plataforma | API | Aceleração | Diretório | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` | -| Linux embarcado | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` | +Este repositório é o **empacotamento Android**: SDK Kotlin, ponte JNI, app de demonstração. O motor C++ e os wrappers de modelo ONNX (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) ficam em [speech-core](https://github.com/soniqo/speech-core) e são incorporados via submódulo git. Linux / automotivo (Yocto, Qualcomm SA8295P/SA8255P) está em [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). 
## Modelos @@ -24,15 +21,13 @@ Reconhecimento de fala (114 idiomas), texto para fala (8 idiomas), detecção de | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Detecção de atividade vocal | 2 MB | Qualquer | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Cancelamento de ruído | ~8 MB | Qualquer | -Os modelos são baixados automaticamente no primeiro lançamento (Android) ou colocados manualmente (Linux). - -## Android +Os modelos são baixados automaticamente no primeiro lançamento via `ModelManager.ensureModels()`. -### Experimente a demo +## Experimente a demo Baixe o [APK assinado](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) e instale em qualquer dispositivo Android arm64 (8+). Os modelos (~1,2 GB) são baixados automaticamente no primeiro lançamento. -### Adicionar dependência +## Adicionar dependência ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Uso do Kotlin +## Uso do Kotlin ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### Compilar a partir do código-fonte +## Compilar a partir do código-fonte ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 testes e2e ``` -### Aplicativo de demonstração +`./setup.sh` inicializa o submódulo speech-core e baixa o ONNX Runtime +para `./ort/`. 
+ +## Aplicativo de demonstração O módulo [`app/`](app/) é uma demo mínima de assistente de voz com: @@ -87,9 +85,14 @@ O módulo [`app/`](app/) é uma demo mínima de assistente de voz com: ./gradlew :app:installDebug ``` -### Entrada de voz do sistema (`RecognitionService`) +## Entrada de voz do sistema (`RecognitionService`) -O SDK fornece um `audio.soniqo.speech.service.SpeechRecognitionService` pronto para uso que se conecta à API `SpeechRecognizer` do framework do Android — sem código a escrever. Uma vez que seu app é selecionado como o reconhecedor de voz padrão, qualquer app de terceiros chamando `SpeechRecognizer.createSpeechRecognizer(context)` (sem `ComponentName`) obtém STT totalmente no dispositivo através do seu pipeline. +O SDK fornece um `audio.soniqo.speech.service.SpeechRecognitionService` pronto +para uso que se conecta à API `SpeechRecognizer` do framework do Android — +sem código a escrever. Uma vez que seu app é selecionado como o reconhecedor +de voz padrão, qualquer app de terceiros chamando +`SpeechRecognizer.createSpeechRecognizer(context)` (sem `ComponentName`) +obtém STT totalmente no dispositivo através do seu pipeline. **1. Declare `RECORD_AUDIO` e o serviço em `AndroidManifest.xml`:** @@ -118,20 +121,32 @@ O SDK fornece um `audio.soniqo.speech.service.SpeechRecognitionService` pronto p ``` -(Opcionalmente adicione `android:settingsActivity="..."` para expor um ícone de engrenagem no seletor de entrada de voz do sistema.) +(Opcionalmente adicione `android:settingsActivity="..."` para expor um ícone +de engrenagem no seletor de entrada de voz do sistema.) -**3. Defina o serviço como padrão do sistema** (Configurações → Sistema → Idiomas e entrada → Seletor de entrada de voz no Android puro, ou via adb): +**3. 
Defina o serviço como padrão do sistema** (Configurações → Sistema → +Idiomas e entrada → Seletor de entrada de voz no Android puro, ou via adb): ```bash adb shell settings put secure voice_recognition_service \ your.package/audio.soniqo.speech.service.SpeechRecognitionService ``` -**4. Verifique** executando a tela *Recognizer test* do app demo, que chama `SpeechRecognizer.createSpeechRecognizer(ctx)` (sem componente) e registra cada callback do framework — útil para confirmar o round-trip do binder sem precisar do logcat. +**4. Verifique** executando a tela *Recognizer test* do app demo, que chama +`SpeechRecognizer.createSpeechRecognizer(ctx)` (sem componente) e registra +cada callback do framework — útil para confirmar o round-trip do binder sem +precisar do logcat. -O serviço implementa `onCheckRecognitionSupport` (API 33+) retornando os 27 idiomas BCP-47 cobertos pelo Parakeet TDT v3, marcados como `installedOnDeviceLanguage` quando os modelos estão presentes (ou `pendingOnDeviceLanguage` enquanto eles são baixados). O foco de áudio é adquirido com `AUDIOFOCUS_GAIN_TRANSIENT` pela duração de uma sessão. +O serviço implementa `onCheckRecognitionSupport` (API 33+) retornando os +27 idiomas BCP-47 cobertos pelo Parakeet TDT v3, marcados como +`installedOnDeviceLanguage` quando os modelos estão presentes (ou +`pendingOnDeviceLanguage` enquanto eles são baixados). O foco de áudio é +adquirido com `AUDIOFOCUS_GAIN_TRANSIENT` pela duração de uma sessão. -**Limitação:** Gboard, Samsung Keyboard e Google Assistant agrupam seus próprios reconhecedores e ignoram o padrão do sistema. Apps que chamam explicitamente a API `SpeechRecognizer` do framework (ou constroem sua própria UI em cima dela) são os que passam pelo seu serviço. +**Limitação:** Gboard, Samsung Keyboard e Google Assistant agrupam seus +próprios reconhecedores e ignoram o padrão do sistema. 
Apps que chamam +explicitamente a API `SpeechRecognizer` do framework (ou constroem sua +própria UI em cima dela) são os que passam pelo seu serviço. ## Desempenho @@ -143,53 +158,6 @@ Medido em emulador Android (arm64-v8a, sem NNAPI). Hardware real é significativ | Kokoro 82M | TTS | 1,9s saída | 1.075ms | 0,58 | | Silero VAD v5 | VAD | bloco 32ms | <1ms | <0,01 | -## Linux embarcado - -API C mínima para plataformas automotivas e embarcadas. Veja [`linux/README.md`](linux/README.md) para a documentação completa. - -### Uso da API C - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Aceleração Hexagon DSP - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### Compilar - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### Testar - -```bash -linux/tests/download_models.sh # baixar modelos ONNX -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 testes -``` - -### Compilação cruzada para Yocto - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=... 
-cmake --build build -``` - ## Pipeline ```text @@ -204,41 +172,51 @@ Suporte a barge-in: falar durante a reprodução TTS interrompe e inicia uma nov ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 linhas) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (submódulo git) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ speech_core (orquestração: │ │ +│ │ pipeline · turn · interrupções) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +Cada classe de modelo implementa diretamente a interface correspondente de +speech-core (`VADInterface`, `STTInterface`, `TTSInterface`, +`EnhancerInterface`) — a ponte JNI as instancia e entrega referências ao +`VoicePipeline`. Sem boilerplate de adaptador C-vtable. 
+ ## Aceleração de hardware -| Plataforma | Chipset | Aceleração | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| Automotivo | SA8295P / SA8255P | QNN → Hexagon DSP | -| Qualquer | Fallback CPU | XNNPACK | +| Chipset | Aceleração | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| Fallback CPU | XNNPACK | + +Para Qualcomm SA8295P / SA8255P automotivo com QNN (Hexagon DSP), veja +[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). ## Projetos relacionados -| Repositório | Plataforma | +| Repositório | Escopo | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | Motor de pipeline C++ multiplataforma | -| **speech-android** | Android + Linux embarcado — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | Motor de pipeline C++ multiplataforma + wrappers de modelo ONNX + exemplos Linux/embarcado | +| **speech-android** | Wrapper Android — SDK Kotlin + ponte JNI sobre speech-core | ## Licença diff --git a/README_ru.md b/README_ru.md index 4b95abe..fc3a155 100644 --- a/README_ru.md +++ b/README_ru.md @@ -2,18 +2,15 @@ 📖 Языки: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -Речевой SDK для устройств Android и встраиваемого Linux, основанный на [ONNX Runtime](https://onnxruntime.ai) и [speech-core](https://github.com/soniqo/speech-core). 
+Локальный речевой SDK для Android, основанный на [ONNX Runtime](https://onnxruntime.ai) и [speech-core](https://github.com/soniqo/speech-core). Распознавание речи (114 языков), синтез речи (8 языков), определение голосовой активности и шумоподавление — всё работает локально. Никаких облачных API, никакие данные не покидают устройство. -**[Демо APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Модели](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (аналог для Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (движок конвейера) +**[Демо APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Модели](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (аналог для Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (движок конвейера + сборка для Linux/встраиваемых систем) -## Платформы +## Область применения -| Платформа | API | Ускорение | Каталог | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` | -| Встраиваемый Linux | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` | +Этот репозиторий — **Android-обёртка**: Kotlin SDK, JNI-мост, демо-приложение. C++-движок и обёртки ONNX-моделей (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) находятся в [speech-core](https://github.com/soniqo/speech-core) и подключаются через git-submodule. Linux / автомобильные системы (Yocto, Qualcomm SA8295P/SA8255P) — в [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). 
## Модели @@ -24,15 +21,13 @@ | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Определение голосовой активности | 2 МБ | Любой | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Шумоподавление | ~8 МБ | Любой | -Модели загружаются автоматически при первом запуске (Android) или размещаются вручную (Linux). - -## Android +Модели загружаются автоматически при первом запуске через `ModelManager.ensureModels()`. -### Попробовать демо +## Попробовать демо Скачайте [подписанный APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) и установите на любое arm64-устройство Android (8+). Модели (~1,2 ГБ) загружаются автоматически при первом запуске. -### Добавить зависимость +## Добавить зависимость ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Использование Kotlin +## Использование Kotlin ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### Сборка из исходного кода +## Сборка из исходного кода ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,10 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 e2e-теста ``` -### Демо-приложение +`./setup.sh` инициализирует submodule speech-core и загружает ONNX Runtime +в `./ort/`. + +## Демо-приложение Модуль [`app/`](app/) — минимальное демо голосового ассистента, включающее: @@ -87,7 +85,7 @@ cd speech-android ./gradlew :app:installDebug ``` -### Системный голосовой ввод (`RecognitionService`) +## Системный голосовой ввод (`RecognitionService`) SDK включает готовый к использованию `audio.soniqo.speech.service.SpeechRecognitionService`, который подключается к API `SpeechRecognizer` фреймворка Android — никакого кода писать не нужно. 
Как только ваше приложение выбрано в качестве распознавателя голоса по умолчанию, любое стороннее приложение, вызывающее `SpeechRecognizer.createSpeechRecognizer(context)` (без `ComponentName`), получает полностью локальный STT через ваш конвейер. @@ -143,53 +141,6 @@ adb shell settings put secure voice_recognition_service \ | Kokoro 82M | TTS | 1,9 с вывод | 1075 мс | 0,58 | | Silero VAD v5 | VAD | блок 32 мс | <1 мс | <0,01 | -## Встраиваемый Linux - -Минимальный C API для автомобильных и встраиваемых платформ. Полную документацию см. в [`linux/README.md`](linux/README.md). - -### Использование C API - -```c -#include <speech.h> - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Ускорение Hexagon DSP - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### Сборка - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### Тесты - -```bash -linux/tests/download_models.sh # загрузить модели ONNX -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 тестов -``` - -### Кросс-компиляция для Yocto - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=... 
-cmake --build build -``` - ## Конвейер ```text @@ -204,41 +155,51 @@ Idle → Listening → Transcribing → Speaking → Idle ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 строк) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models (git submodule) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ speech_core (оркестрация: │ │ +│ │ pipeline · turn · прерывания) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +Каждый класс модели напрямую реализует соответствующий интерфейс speech-core +(`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) — +JNI-мост создаёт их и передаёт ссылки в `VoicePipeline`. Никаких шаблонных +обвязок через C-vtable. 
+ ## Аппаратное ускорение -| Платформа | Чипсет | Ускорение | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| Автомобильная | SA8295P / SA8255P | QNN → Hexagon DSP | -| Любая | Резерв CPU | XNNPACK | +| Чипсет | Ускорение | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| Резерв CPU | XNNPACK | + +Для автомобильных Qualcomm SA8295P / SA8255P с QNN (Hexagon DSP) см. +[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux). ## Связанные проекты -| Репозиторий | Платформа | +| Репозиторий | Область | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | Кроссплатформенный движок конвейера на C++ | -| **speech-android** | Android + встраиваемый Linux — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | Кроссплатформенный движок конвейера на C++ + обёртки ONNX-моделей + примеры для Linux/встраиваемых систем | +| **speech-android** | Android-обёртка — Kotlin SDK + JNI-мост поверх speech-core | ## Лицензия diff --git a/README_zh.md b/README_zh.md index c92bc91..db1c562 100644 --- a/README_zh.md +++ b/README_zh.md @@ -2,18 +2,15 @@ 📖 阅读语言: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md) -适用于 Android 和嵌入式 Linux 的设备端语音 SDK,基于 [ONNX Runtime](https://onnxruntime.ai) 和 [speech-core](https://github.com/soniqo/speech-core) 构建。 +适用于 Android 的设备端语音 SDK,基于 [ONNX Runtime](https://onnxruntime.ai) 和 [speech-core](https://github.com/soniqo/speech-core) 构建。 语音识别(114 种语言)、文本转语音(8 
种语言)、语音活动检测和噪声消除——全部在本地运行。无需云端 API,数据不会离开设备。 -**[演示 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[模型](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 对应版本)· **[speech-core](https://github.com/soniqo/speech-core)**(管线引擎) +**[演示 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[模型](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 对应版本)· **[speech-core](https://github.com/soniqo/speech-core)**(管线引擎 + Linux/嵌入式构建) -## 平台 +## 范围 -| 平台 | API | 加速 | 目录 | -| --- | --- | --- | --- | -| Android | Kotlin (`SpeechPipeline`) | NNAPI(Snapdragon、Exynos、Tensor) | `sdk/` | -| 嵌入式 Linux | C (`speech.h`) | QNN(Hexagon DSP) | `linux/` | +本仓库是 **Android 打包**:Kotlin SDK、JNI 桥接、演示应用。C++ 引擎和 ONNX 模型封装(Silero VAD、Parakeet STT、Kokoro TTS、DeepFilterNet3)位于 [speech-core](https://github.com/soniqo/speech-core),通过 git 子模块引入。Linux / 汽车(Yocto、Qualcomm SA8295P/SA8255P)位于 [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux)。 ## 模型 @@ -24,15 +21,13 @@ | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | 语音活动检测 | 2 MB | 任意 | | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | 噪声消除 | ~8 MB | 任意 | -模型在首次启动时自动下载(Android)或手动放置(Linux)。 - -## Android +模型在首次启动时通过 `ModelManager.ensureModels()` 自动下载。 -### 试用演示 +## 试用演示 下载[已签名的 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) 并安装到任何 arm64 Android 设备(8 及以上)。模型(~1.2 GB)在首次启动时自动下载。 -### 添加依赖 +## 添加依赖 ```kotlin dependencies { @@ -40,7 +35,7 @@ dependencies { } ``` -### Kotlin 用法 +## Kotlin 用法 ```kotlin val modelDir = ModelManager.ensureModels(context) @@ -63,7 +58,7 @@ pipeline.start() pipeline.pushAudio(samples) ``` -### 从源代码构建 
+## 从源代码构建 ```bash git clone --recursive https://github.com/soniqo/speech-android.git @@ -73,7 +68,9 @@ cd speech-android ./gradlew :sdk:connectedAndroidTest # 34 个端到端测试 ``` -### 演示应用 +`./setup.sh` 会初始化 speech-core 子模块并将 ONNX Runtime 下载到 `./ort/`。 + +## 演示应用 [`app/`](app/) 模块是一个最小化的语音助手演示,包含: @@ -87,7 +84,7 @@ cd speech-android ./gradlew :app:installDebug ``` -### 系统语音输入(`RecognitionService`) +## 系统语音输入(`RecognitionService`) SDK 自带可直接使用的 `audio.soniqo.speech.service.SpeechRecognitionService`,接入 Android 框架的 `SpeechRecognizer` API — 无需编写代码。一旦你的应用被设为默认语音识别器,任何调用 `SpeechRecognizer.createSpeechRecognizer(context)`(不指定 `ComponentName`)的第三方应用都能通过你的流水线获得完全本地的 STT。 @@ -143,53 +140,6 @@ adb shell settings put secure voice_recognition_service \ | Kokoro 82M | TTS | 1.9 秒输出 | 1,075 毫秒 | 0.58 | | Silero VAD v5 | VAD | 32 毫秒块 | <1 毫秒 | <0.01 | -## 嵌入式 Linux - -适用于汽车和嵌入式平台的最小化 C API。完整文档参见 [`linux/README.md`](linux/README.md)。 - -### C API 用法 - -```c -#include - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("%s\n", event->text); -} - -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; -cfg.use_qnn = true; // Hexagon DSP 加速 - -speech_pipeline_t p = speech_create(cfg, on_event, NULL); -speech_start(p); -speech_push_audio(p, pcm_samples, 512); -``` - -### 构建 - -```bash -cd linux && ./setup_linux.sh -cmake -B build -DORT_DIR=../ort-linux -cmake --build build -./build/speech_demo --model-dir /path/to/models -``` - -### 测试 - -```bash -linux/tests/download_models.sh # 下载 ONNX 模型 -SPEECH_MODEL_DIR=tests/models ./build/speech_test # 12 个测试 -``` - -### 为 Yocto 交叉编译 - -```bash -source /opt/poky/environment-setup-aarch64-poky-linux -cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=... 
-cmake --build build -``` - ## 管线 ```text @@ -204,41 +154,47 @@ Idle → Listening → Transcribing → Speaking → Idle ```text ┌──────────────────────────────────────────────┐ -│ Android: SpeechPipeline (Kotlin/JNI) │ -│ Linux: speech.h (C API) │ -└──────────────────┬───────────────────────────┘ - │ -┌──────────────────┴───────────────────────────┐ -│ speech-core (C++ submodule) │ -│ Turn detection · Interruptions · Context │ -└──┬────────┬────────┬────────┬────────────────┘ - │ │ │ │ vtables -┌──┴──┐ ┌──┴──┐ ┌──┴──┐ ┌─┴────────┐ -│ VAD │ │ STT │ │ TTS │ │ Enhancer │ -│Silero│ │Para-│ │Koko-│ │DeepFilter│ -│ │ │keet │ │ro │ │Net3 │ -└──┬──┘ └──┬──┘ └──┬──┘ └─┬────────┘ - └────────┴────────┴────────┘ - ONNX Runtime (CPU / NNAPI / QNN) +│ SpeechPipeline (Kotlin) │ +│ │ │ +│ ▼ │ +│ jni_bridge.cpp (~250 行) │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ speech_core_models(git 子模块) │ │ +│ │ SileroVad / ParakeetStt / │ │ +│ │ KokoroTts / DeepFilterEnhancer │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ speech_core(编排: │ │ +│ │ 管线 · 轮次 · 打断) │ │ +│ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ONNX Runtime (CPU / NNAPI) │ +└──────────────────────────────────────────────┘ ``` +每个模型类直接实现对应的 speech-core 接口(`VADInterface`、`STTInterface`、`TTSInterface`、`EnhancerInterface`)—— JNI 桥接实例化它们并将引用交给 `VoicePipeline`。无需 C-vtable 适配器样板代码。 + ## 硬件加速 -| 平台 | 芯片组 | 加速 | -| --- | --- | --- | -| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | -| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU | -| Android | Google Tensor G2+ | NNAPI → Google TPU | -| 汽车 | SA8295P / SA8255P | QNN → Hexagon DSP | -| 任意 | CPU 回退 | XNNPACK | +| 芯片组 | 加速 | +| --- | --- | +| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU | +| Samsung Exynos 2200+ | NNAPI → Samsung NPU | +| Google Tensor G2+ | NNAPI → Google TPU | +| CPU 回退 | XNNPACK | + +汽车 Qualcomm SA8295P / SA8255P 搭配 QNN(Hexagon DSP)的方案,请参见 [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux)。 ## 相关项目 -| 
仓库 | 平台 | +| 仓库 | 范围 | | --- | --- | | [speech-swift](https://github.com/soniqo/speech-swift) | Apple(macOS、iOS)— MLX + CoreML | -| [speech-core](https://github.com/soniqo/speech-core) | 跨平台 C++ 管线引擎 | -| **speech-android** | Android + 嵌入式 Linux — ONNX Runtime | +| [speech-core](https://github.com/soniqo/speech-core) | 跨平台 C++ 管线引擎 + ONNX 模型封装 + Linux/嵌入式示例 | +| **speech-android** | Android 封装 — 基于 speech-core 的 Kotlin SDK + JNI 桥接 | ## 许可证 diff --git a/linux/CMakeLists.txt b/linux/CMakeLists.txt deleted file mode 100644 index 667fccd..0000000 --- a/linux/CMakeLists.txt +++ /dev/null @@ -1,102 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(speech_linux VERSION 0.1.0 LANGUAGES CXX) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -option(SPEECH_BUILD_DEMO "Build ALSA demo CLI" ON) -option(SPEECH_BUILD_TESTS "Build tests" ON) -option(SPEECH_BUILD_TOOLS "Build CLI tools (transcribe)" ON) - -# --- Paths --- -set(SDK_CPP "${CMAKE_CURRENT_SOURCE_DIR}/../sdk/src/main/cpp") -set(SPEECH_CORE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../speech-core" CACHE PATH "speech-core directory") -set(ORT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../ort-linux" CACHE PATH "ONNX Runtime directory") - -# --- speech-core (static library) --- -file(GLOB_RECURSE SPEECH_CORE_SOURCES "${SPEECH_CORE_DIR}/src/*.cpp") -add_library(speech_core STATIC ${SPEECH_CORE_SOURCES}) -target_include_directories(speech_core PUBLIC "${SPEECH_CORE_DIR}/include") -target_compile_features(speech_core PUBLIC cxx_std_17) - -# --- ONNX Runtime --- -add_library(onnxruntime SHARED IMPORTED) -if(APPLE) - set(_ORT_LIB "${ORT_DIR}/lib/libonnxruntime.dylib") -else() - set(_ORT_LIB "${ORT_DIR}/lib/libonnxruntime.so") -endif() -set_target_properties(onnxruntime PROPERTIES - IMPORTED_LOCATION "${_ORT_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${ORT_DIR}/include" -) - -# --- libspeech.so --- -add_library(speech SHARED - src/speech.cpp - ${SDK_CPP}/audio/mel.cpp - 
${SDK_CPP}/audio/fft.cpp - ${SDK_CPP}/audio/stft.cpp - ${SDK_CPP}/models/silero_vad.cpp - ${SDK_CPP}/models/parakeet_stt.cpp - ${SDK_CPP}/models/kokoro_tts.cpp - ${SDK_CPP}/models/kokoro_phonemizer.cpp - ${SDK_CPP}/models/kokoro_multilingual.cpp - ${SDK_CPP}/models/deepfilter.cpp -) - -target_include_directories(speech - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/include - PRIVATE - ${SDK_CPP} - ${SDK_CPP}/models - ${ORT_DIR}/include - ${SPEECH_CORE_DIR}/include -) - -target_link_libraries(speech PRIVATE speech_core onnxruntime) - -# --- Demo CLI --- -if(SPEECH_BUILD_DEMO) - add_executable(speech_demo demo/main.cpp) - target_link_libraries(speech_demo PRIVATE speech) - target_include_directories(speech_demo PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include) - - find_library(ALSA_LIB asound) - if(ALSA_LIB) - target_link_libraries(speech_demo PRIVATE ${ALSA_LIB}) - target_compile_definitions(speech_demo PRIVATE HAS_ALSA=1) - endif() -endif() - -# --- Tests --- -if(SPEECH_BUILD_TESTS) - enable_testing() - add_executable(speech_test tests/test_pipeline.cpp) - target_link_libraries(speech_test PRIVATE speech) - target_include_directories(speech_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include) - add_test(NAME pipeline_test COMMAND speech_test) -endif() - -# --- CLI tools --- -if(SPEECH_BUILD_TOOLS) - add_executable(speech_transcribe tools/transcribe.cpp) - target_link_libraries(speech_transcribe PRIVATE speech) - target_include_directories(speech_transcribe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include) - - # speech_synthesize calls KokoroTts directly — needs the SDK private headers. 
- add_executable(speech_synthesize tools/synthesize.cpp) - target_link_libraries(speech_synthesize PRIVATE speech onnxruntime) - target_include_directories(speech_synthesize PRIVATE - ${SDK_CPP} - ${SDK_CPP}/models - ${ORT_DIR}/include) - - add_executable(speech_phonemize tools/phonemize.cpp) - target_link_libraries(speech_phonemize PRIVATE speech) - target_include_directories(speech_phonemize PRIVATE - ${SDK_CPP} - ${SDK_CPP}/models) -endif() diff --git a/linux/README.md b/linux/README.md deleted file mode 100644 index 574bcdf..0000000 --- a/linux/README.md +++ /dev/null @@ -1,151 +0,0 @@ -# speech-linux - -On-device speech SDK for embedded Linux — VAD, STT (multilingual), TTS, noise cancellation. - -Targets automotive (Qualcomm SA8295P, SA8255P) and embedded ARM64 platforms running Yocto or similar Linux distributions. - -## Quick Start - -```bash -# Download ONNX Runtime -./setup_linux.sh - -# Build -cmake -B build -DORT_DIR=../ort-linux -cmake --build build - -# Run tests -cd build && ctest - -# Run demo (ALSA mic) -./speech_demo --model-dir /path/to/models - -# Run demo (stdin PCM pipe) -arecord -f FLOAT_LE -r 16000 -c 1 | ./speech_demo --model-dir /path/to/models -``` - -## C API - -```c -#include - -void on_event(const speech_event_t* event, void* ctx) { - if (event->type == SPEECH_EVENT_TRANSCRIPTION) - printf("STT: %s\n", event->text); -} - -int main() { - speech_config_t cfg = speech_config_default(); - cfg.model_dir = "/opt/speech/models"; - - speech_pipeline_t p = speech_create(cfg, on_event, NULL); - speech_start(p); - - // Feed 16kHz mono float32 PCM from your audio source - while (has_audio()) { - float buf[512]; - read_audio(buf, 512); - speech_push_audio(p, buf, 512); - } - - speech_destroy(p); -} -``` - -### Functions - -| Function | Description | -|---|---| -| `speech_config_default()` | Default config (INT8, CPU, 400ms silence threshold) | -| `speech_create(config, callback, ctx)` | Load models, create pipeline. 
Returns `NULL` on failure | -| `speech_start(pipeline)` | Start processing audio | -| `speech_push_audio(pipeline, samples, count)` | Feed PCM float32 at 16 kHz | -| `speech_resume_listening(pipeline)` | Resume after TTS playback | -| `speech_destroy(pipeline)` | Free all resources | -| `speech_version()` | Version string | - -### Events - -| Event | Fields | Description | -|---|---|---| -| `SPEECH_EVENT_READY` | — | Pipeline initialized | -| `SPEECH_EVENT_SPEECH_STARTED` | — | VAD detected speech | -| `SPEECH_EVENT_SPEECH_ENDED` | — | VAD detected silence | -| `SPEECH_EVENT_TRANSCRIPTION` | `text`, `confidence`, `stt_duration_ms` | Final transcription | -| `SPEECH_EVENT_RESPONSE_AUDIO` | `audio_data`, `audio_data_length` | TTS PCM16 audio chunk (24 kHz) | -| `SPEECH_EVENT_RESPONSE_DONE` | `tts_duration_ms` | TTS complete | -| `SPEECH_EVENT_ERROR` | `text` | Error message | - -### Configuration - -```c -speech_config_t cfg = speech_config_default(); -cfg.model_dir = "/opt/speech/models"; // required -cfg.use_int8 = true; // INT8 quantized models (default) -cfg.use_qnn = true; // Qualcomm QNN EP (Hexagon DSP) -cfg.enable_enhancer = true; // DeepFilterNet noise cancellation -cfg.transcribe_only = true; // STT only, no TTS echo -cfg.min_silence_duration = 0.4f; // seconds before end-of-speech -``` - -## Models - -Download from HuggingFace (`aufklarer/` org) into a single directory: - -``` -models/ - silero-vad.onnx 2 MB Voice activity detection - parakeet-encoder-int8.onnx 840 MB STT encoder (multilingual, 114 languages) - parakeet-decoder-joint-int8.onnx 51 MB STT decoder - vocab.json 156 KB BPE vocabulary (8192 tokens) - kokoro-int8.onnx 330 MB TTS (English) - vocab_index.json 2 KB TTS phonemizer vocab - us_gold.json 2 B TTS phonemizer dict - us_silver.json 2 B TTS phonemizer dict - voices/af_heart.bin 1 KB Voice embedding -``` - -## Cross-Compilation (Yocto) - -```bash -# Source Yocto SDK environment -source /opt/poky/environment-setup-aarch64-poky-linux - -# Build 
with cross-toolchain -cmake -B build \ - -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake \ - -DORT_DIR=/path/to/ort-linux-aarch64 - -cmake --build build -``` - -## QNN (Qualcomm Hexagon DSP) - -For hardware acceleration on SA8295P / SA8255P: - -1. Build ONNX Runtime with QNN EP or use Qualcomm's prebuilt -2. Place `libQnnHtp.so` in the library path -3. Set `cfg.use_qnn = true` - -The pipeline falls back to CPU if QNN is unavailable. - -## Architecture - -``` -libspeech.so - ├── speech.h (C API) - ├── speech-core (pipeline orchestration) - ├── Silero VAD v5 (voice activity detection) - ├── Parakeet TDT v3 (multilingual STT, 114 languages) - ├── Kokoro 82M (TTS) - ├── DeepFilterNet3 (noise cancellation) - └── ONNX Runtime (CPU / QNN EP) -``` - -All inference runs on-device. No network required after model download. - -## Thread Safety - -- `speech_push_audio()` is thread-safe (single producer) -- Event callback fires from an internal worker thread -- Do not call `speech_destroy()` from the event callback diff --git a/linux/demo/main.cpp b/linux/demo/main.cpp deleted file mode 100644 index 5ac0ffa..0000000 --- a/linux/demo/main.cpp +++ /dev/null @@ -1,135 +0,0 @@ -#include "speech.h" - -#include -#include -#include -#include -#include - -#ifdef HAS_ALSA -#include -#endif - -static volatile bool running = true; - -static void signal_handler(int) { running = false; } - -static void on_event(const speech_event_t* event, void* /*ctx*/) { - switch (event->type) { - case SPEECH_EVENT_SPEECH_STARTED: - fprintf(stderr, "[VAD] speech started\n"); - break; - case SPEECH_EVENT_SPEECH_ENDED: - fprintf(stderr, "[VAD] speech ended\n"); - break; - case SPEECH_EVENT_TRANSCRIPTION: - printf("[STT] %s (%.0fms, conf=%.2f)\n", - event->text ? 
event->text : "", - event->stt_duration_ms, event->confidence); - fflush(stdout); - break; - case SPEECH_EVENT_RESPONSE_DONE: - fprintf(stderr, "[TTS] done (%.0fms)\n", event->tts_duration_ms); - break; - case SPEECH_EVENT_ERROR: - fprintf(stderr, "[ERROR] %s\n", event->text ? event->text : "unknown"); - break; - default: - break; - } -} - -static void print_usage(const char* prog) { - fprintf(stderr, "Usage: %s --model-dir [--qnn] [--transcribe-only] [--device ]\n", prog); -} - -int main(int argc, char* argv[]) { - const char* model_dir = nullptr; - const char* alsa_device = "default"; - bool use_qnn = false; - bool transcribe_only = false; - - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "--model-dir") == 0 && i + 1 < argc) { - model_dir = argv[++i]; - } else if (strcmp(argv[i], "--qnn") == 0) { - use_qnn = true; - } else if (strcmp(argv[i], "--transcribe-only") == 0) { - transcribe_only = true; - } else if (strcmp(argv[i], "--device") == 0 && i + 1 < argc) { - alsa_device = argv[++i]; - } else { - print_usage(argv[0]); - return 1; - } - } - - if (!model_dir) { - print_usage(argv[0]); - return 1; - } - - fprintf(stderr, "speech-linux %s\n", speech_version()); - fprintf(stderr, "Models: %s\n", model_dir); - fprintf(stderr, "QNN: %s\n", use_qnn ? "yes" : "no"); - - speech_config_t config = speech_config_default(); - config.model_dir = model_dir; - config.use_qnn = use_qnn; - config.transcribe_only = transcribe_only; - - fprintf(stderr, "Loading models...\n"); - speech_pipeline_t pipeline = speech_create(config, on_event, nullptr); - if (!pipeline) { - fprintf(stderr, "Failed to create pipeline\n"); - return 1; - } - - signal(SIGINT, signal_handler); - signal(SIGTERM, signal_handler); - - speech_start(pipeline); - fprintf(stderr, "Listening... 
(Ctrl+C to stop)\n"); - -#ifdef HAS_ALSA - snd_pcm_t* capture = nullptr; - int err = snd_pcm_open(&capture, alsa_device, SND_PCM_STREAM_CAPTURE, 0); - if (err < 0) { - fprintf(stderr, "ALSA open failed: %s\n", snd_strerror(err)); - speech_destroy(pipeline); - return 1; - } - - snd_pcm_set_params(capture, SND_PCM_FORMAT_FLOAT_LE, SND_PCM_ACCESS_RW_INTERLEAVED, - 1, 16000, 1, 100000); - - float buffer[512]; - while (running) { - snd_pcm_sframes_t frames = snd_pcm_readi(capture, buffer, 512); - if (frames < 0) { - frames = snd_pcm_recover(capture, (int)frames, 0); - if (frames < 0) break; - } - if (frames > 0) { - speech_push_audio(pipeline, buffer, (size_t)frames); - } - } - - snd_pcm_close(capture); -#else - // No ALSA: read raw float32 PCM from stdin - fprintf(stderr, "No ALSA — reading float32 PCM from stdin (16kHz mono)\n"); - float buffer[512]; - while (running) { - size_t n = fread(buffer, sizeof(float), 512, stdin); - if (n == 0) break; - speech_push_audio(pipeline, buffer, n); - // Simulate real-time pace - usleep((unsigned int)(n * 1000000 / 16000)); - } -#endif - - fprintf(stderr, "\nShutting down...\n"); - speech_destroy(pipeline); - return 0; -} diff --git a/linux/include/speech.h b/linux/include/speech.h deleted file mode 100644 index b7b1549..0000000 --- a/linux/include/speech.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef SPEECH_H -#define SPEECH_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct speech_pipeline_s* speech_pipeline_t; - -typedef enum { - SPEECH_EVENT_READY = 0, - SPEECH_EVENT_SPEECH_STARTED, - SPEECH_EVENT_SPEECH_ENDED, - SPEECH_EVENT_PARTIAL_TRANSCRIPTION, - SPEECH_EVENT_TRANSCRIPTION, - SPEECH_EVENT_RESPONSE_AUDIO, - SPEECH_EVENT_RESPONSE_DONE, - SPEECH_EVENT_ERROR -} speech_event_type_t; - -typedef struct { - speech_event_type_t type; - const char* text; - const uint8_t* audio_data; - size_t audio_data_length; - float confidence; - float stt_duration_ms; - float tts_duration_ms; -} 
speech_event_t; - -typedef struct { - const char* model_dir; - bool use_int8; - bool use_qnn; - bool enable_enhancer; - bool transcribe_only; - float min_silence_duration; -} speech_config_t; - -typedef void (*speech_event_fn)(const speech_event_t* event, void* context); - -speech_config_t speech_config_default(void); - -speech_pipeline_t speech_create(speech_config_t config, - speech_event_fn on_event, - void* event_context); - -void speech_start(speech_pipeline_t pipeline); - -void speech_push_audio(speech_pipeline_t pipeline, - const float* samples, size_t count); - -void speech_resume_listening(speech_pipeline_t pipeline); - -void speech_destroy(speech_pipeline_t pipeline); - -const char* speech_version(void); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/linux/setup_linux.sh b/linux/setup_linux.sh deleted file mode 100755 index 8b7f5a7..0000000 --- a/linux/setup_linux.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -set -euo pipefail - -ORT_VERSION="1.19.0" -OS="${OS:-$(uname -s)}" -ARCH="${1:-$(uname -m)}" - -ROOT="$(cd "$(dirname "$0")/.." && pwd)" -ORT_DIR="${ROOT}/ort-linux" - -echo "=== speech-linux setup (${OS} ${ARCH}) ===" - -if [ ! -f "${ORT_DIR}/include/onnxruntime_c_api.h" ]; then - echo "Downloading ONNX Runtime ${ORT_VERSION} for ${OS} ${ARCH}..." 
- - case "${OS}-${ARCH}" in - Linux-aarch64|Linux-arm64) - ORT_URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-aarch64-${ORT_VERSION}.tgz" - ORT_LIB_GLOB="libonnxruntime.so*" - ;; - Linux-x86_64|Linux-amd64) - ORT_URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz" - ORT_LIB_GLOB="libonnxruntime.so*" - ;; - Darwin-arm64|Darwin-aarch64) - ORT_URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-osx-arm64-${ORT_VERSION}.tgz" - ORT_LIB_GLOB="libonnxruntime*.dylib" - ;; - Darwin-x86_64) - ORT_URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-osx-x86_64-${ORT_VERSION}.tgz" - ORT_LIB_GLOB="libonnxruntime*.dylib" - ;; - *) - echo "Unsupported platform: ${OS}-${ARCH}" - exit 1 - ;; - esac - - TMP_DIR=$(mktemp -d) - curl -L -o "${TMP_DIR}/ort.tgz" "${ORT_URL}" - - mkdir -p "${ORT_DIR}" - tar xf "${TMP_DIR}/ort.tgz" -C "${TMP_DIR}" - - # Find extracted dir - ORT_EXTRACTED=$(find "${TMP_DIR}" -maxdepth 1 -name "onnxruntime-*" -type d | head -1) - - mkdir -p "${ORT_DIR}/include" "${ORT_DIR}/lib" - cp "${ORT_EXTRACTED}"/include/*.h "${ORT_DIR}/include/" - cp "${ORT_EXTRACTED}"/lib/${ORT_LIB_GLOB} "${ORT_DIR}/lib/" - - rm -rf "${TMP_DIR}" - echo "ONNX Runtime installed to ${ORT_DIR}" -else - echo "ONNX Runtime already installed" -fi - -echo "" -echo "Build with:" -echo " cd linux && cmake -B build -DORT_DIR=${ORT_DIR} && cmake --build build" diff --git a/linux/src/speech.cpp b/linux/src/speech.cpp deleted file mode 100644 index fb933fa..0000000 --- a/linux/src/speech.cpp +++ /dev/null @@ -1,259 +0,0 @@ -#include "speech.h" - -#include -#include - -#include -#include "models/onnx_engine.h" -#include "models/silero_vad.h" -#include "models/parakeet_stt.h" -#include "models/kokoro_tts.h" -#include "models/deepfilter.h" - -// 
--------------------------------------------------------------------------- -// Pipeline handle -// --------------------------------------------------------------------------- - -struct speech_pipeline_s { - sc_pipeline_t pipeline = nullptr; - SileroVad* vad = nullptr; - ParakeetStt* stt = nullptr; - KokoroTts* tts = nullptr; - DeepFilterEnhancer* enhancer = nullptr; - speech_event_fn user_callback = nullptr; - void* user_context = nullptr; - - ~speech_pipeline_s() { - if (pipeline) sc_pipeline_destroy(pipeline); - delete enhancer; - delete tts; - delete stt; - delete vad; - } -}; - -// --------------------------------------------------------------------------- -// speech-core vtable adapters (pure C++, no platform deps) -// --------------------------------------------------------------------------- - -static float vad_process_chunk(void* ctx, const float* samples, size_t len) { - return static_cast(ctx)->process_chunk(samples, len); -} -static void vad_reset(void* ctx) { - static_cast(ctx)->reset(); -} -static int vad_sample_rate(void* ctx) { - return static_cast(ctx)->input_sample_rate(); -} -static size_t vad_chunk_size(void* ctx) { - return static_cast(ctx)->chunk_size(); -} - -static sc_transcription_result_t stt_transcribe( - void* ctx, const float* audio, size_t len, int sr) -{ - auto* stt = static_cast(ctx); - auto r = stt->transcribe(audio, len, sr); - - static thread_local std::string text_buf; - static thread_local std::string lang_buf; - text_buf = std::move(r.text); - lang_buf = std::move(r.language); - - return { - .text = text_buf.c_str(), - .language = lang_buf.empty() ? 
nullptr : lang_buf.c_str(), - .confidence = r.confidence, - .start_time = 0.0f, - .end_time = 0.0f, - }; -} -static int stt_sample_rate(void* ctx) { - return static_cast(ctx)->input_sample_rate(); -} - -static void tts_synthesize( - void* ctx, const char* text, const char* language, - sc_tts_chunk_fn on_chunk, void* chunk_ctx) -{ - static_cast(ctx)->synthesize(text, language, on_chunk, chunk_ctx); -} -static int tts_sample_rate(void* ctx) { - return static_cast(ctx)->output_sample_rate(); -} -static void tts_cancel(void* ctx) { - static_cast(ctx)->cancel(); -} - -static void enhancer_enhance( - void* ctx, const float* input, size_t len, int sr, float* output) -{ - static_cast(ctx)->enhance(input, len, sr, output); -} -static int enhancer_sample_rate(void* ctx) { - return static_cast(ctx)->input_sample_rate(); -} - -// --------------------------------------------------------------------------- -// Event bridge: sc_event_t → speech_event_t -// --------------------------------------------------------------------------- - -static void on_pipeline_event(const sc_event_t* event, void* context) { - auto* h = static_cast(context); - if (!h->user_callback) return; - - speech_event_t out = {}; - out.text = event->text; - out.audio_data = event->audio_data; - out.audio_data_length = event->audio_data_length; - out.confidence = event->confidence; - out.stt_duration_ms = event->stt_duration_ms; - out.tts_duration_ms = event->tts_duration_ms; - - switch (event->type) { - case SC_EVENT_SESSION_CREATED: out.type = SPEECH_EVENT_READY; break; - case SC_EVENT_SPEECH_STARTED: out.type = SPEECH_EVENT_SPEECH_STARTED; break; - case SC_EVENT_SPEECH_ENDED: out.type = SPEECH_EVENT_SPEECH_ENDED; break; - case SC_EVENT_PARTIAL_TRANSCRIPTION: out.type = SPEECH_EVENT_PARTIAL_TRANSCRIPTION; break; - case SC_EVENT_TRANSCRIPTION_COMPLETED: out.type = SPEECH_EVENT_TRANSCRIPTION; break; - case SC_EVENT_RESPONSE_AUDIO_DELTA: out.type = SPEECH_EVENT_RESPONSE_AUDIO; break; - case 
SC_EVENT_RESPONSE_DONE: out.type = SPEECH_EVENT_RESPONSE_DONE; break; - case SC_EVENT_ERROR: out.type = SPEECH_EVENT_ERROR; break; - default: return; // skip unmapped events - } - - h->user_callback(&out, h->user_context); -} - -// --------------------------------------------------------------------------- -// Public C API -// --------------------------------------------------------------------------- - -speech_config_t speech_config_default(void) { - return { - .model_dir = nullptr, - .use_int8 = true, - .use_qnn = false, - .enable_enhancer = false, - .transcribe_only = false, - .min_silence_duration = 0.4f, - }; -} - -speech_pipeline_t speech_create(speech_config_t config, - speech_event_fn on_event, - void* event_context) -{ - if (!config.model_dir) return nullptr; - - auto* h = new speech_pipeline_s(); - h->user_callback = on_event; - h->user_context = event_context; - - std::string dir(config.model_dir); - std::string suffix = config.use_int8 ? "-int8" : ""; - bool hw_accel = config.use_qnn; - - try { - h->vad = new SileroVad(dir + "/silero-vad.onnx"); - h->stt = new ParakeetStt( - dir + "/parakeet-encoder" + suffix + ".onnx", - dir + "/parakeet-decoder-joint" + suffix + ".onnx", - dir + "/vocab.json", - hw_accel); - // Skip TTS when transcribe-only — saves model load time and lets - // the CLI run on a slimmer model directory (no kokoro-e2e bundle). - if (!config.transcribe_only) { - h->tts = new KokoroTts( - dir + "/kokoro-e2e.onnx", - dir + "/voices", dir, hw_accel); - } - - // VAD vtable - sc_vad_vtable_t vad_vt = {}; - vad_vt.context = h->vad; - vad_vt.process_chunk = vad_process_chunk; - vad_vt.reset = ::vad_reset; - vad_vt.input_sample_rate = ::vad_sample_rate; - vad_vt.chunk_size = ::vad_chunk_size; - - // STT vtable - sc_stt_vtable_t stt_vt = {}; - stt_vt.context = h->stt; - stt_vt.transcribe = ::stt_transcribe; - stt_vt.input_sample_rate = ::stt_sample_rate; - - // TTS vtable — populated only when TTS was loaded. 
- sc_tts_vtable_t tts_vt = {}; - if (h->tts) { - tts_vt.context = h->tts; - tts_vt.synthesize = ::tts_synthesize; - tts_vt.output_sample_rate = ::tts_sample_rate; - tts_vt.cancel = ::tts_cancel; - } - - // Pipeline config - sc_config_t sc_cfg = sc_config_default(); - sc_cfg.min_silence_duration = config.min_silence_duration; - if (config.transcribe_only) { - sc_cfg.mode = SC_MODE_TRANSCRIBE_ONLY; - } else { - sc_cfg.mode = SC_MODE_ECHO; - } - - h->pipeline = sc_pipeline_create( - stt_vt, tts_vt, nullptr, vad_vt, - sc_cfg, on_pipeline_event, h); - - if (!h->pipeline) { - delete h; - return nullptr; - } - - // Optional enhancer - if (config.enable_enhancer) { - std::string aux = dir + "/deepfilter-auxiliary.bin"; - std::string df = dir + "/deepfilter" + suffix + ".onnx"; - FILE* f = fopen(df.c_str(), "r"); - if (f) { - fclose(f); - h->enhancer = new DeepFilterEnhancer(df, aux, hw_accel); - sc_enhancer_vtable_t enh_vt = {}; - enh_vt.context = h->enhancer; - enh_vt.enhance = ::enhancer_enhance; - enh_vt.input_sample_rate = ::enhancer_sample_rate; - sc_pipeline_set_enhancer(h->pipeline, enh_vt); - } - } - - return h; - - } catch (const std::exception& e) { - LOGE("Pipeline creation failed: %s", e.what()); - delete h; - return nullptr; - } -} - -void speech_start(speech_pipeline_t pipeline) { - if (pipeline && pipeline->pipeline) sc_pipeline_start(pipeline->pipeline); -} - -void speech_push_audio(speech_pipeline_t pipeline, - const float* samples, size_t count) { - if (pipeline && pipeline->pipeline) - sc_pipeline_push_audio(pipeline->pipeline, samples, count); -} - -void speech_resume_listening(speech_pipeline_t pipeline) { - if (pipeline && pipeline->pipeline) - sc_pipeline_resume_listening(pipeline->pipeline); -} - -void speech_destroy(speech_pipeline_t pipeline) { - delete pipeline; -} - -const char* speech_version(void) { - return "0.0.1"; -} diff --git a/linux/tests/download_models.sh b/linux/tests/download_models.sh deleted file mode 100755 index b803f1a..0000000 
--- a/linux/tests/download_models.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Download ONNX models for testing. -# Usage: ./download_models.sh [output_dir] - -BASE_URL="https://huggingface.co/aufklarer" -OUT="${1:-$(dirname "$0")/models}" -mkdir -p "$OUT/voices" - -FILES=( - "Silero-VAD-v5-ONNX/silero-vad.onnx" - "Parakeet-TDT-v3-ONNX/parakeet-encoder-int8.onnx" - "Parakeet-TDT-v3-ONNX/parakeet-decoder-joint-int8.onnx" - "Parakeet-TDT-v3-ONNX/vocab.json" - "Kokoro-82M-ONNX/kokoro-e2e.onnx" - "Kokoro-82M-ONNX/kokoro-e2e.onnx.data" - "Kokoro-82M-ONNX/vocab_index.json" - "Kokoro-82M-ONNX/us_gold.json" - "Kokoro-82M-ONNX/us_silver.json" - "Kokoro-82M-ONNX/dict_fr.json" - "Kokoro-82M-ONNX/dict_es.json" - "Kokoro-82M-ONNX/dict_it.json" - "Kokoro-82M-ONNX/dict_pt.json" - "Kokoro-82M-ONNX/dict_hi.json" - "Kokoro-82M-ONNX/voices/af_heart.bin" -) - -for entry in "${FILES[@]}"; do - repo="${entry%%/*}" - file="${entry#*/}" - dest="$OUT/$file" - if [ -f "$dest" ]; then - continue - fi - echo "Downloading $file..." 
- curl -sL -o "$dest" "$BASE_URL/$repo/resolve/main/$file" -done - -echo "Models ready in $OUT" diff --git a/linux/tests/test_pipeline.cpp b/linux/tests/test_pipeline.cpp deleted file mode 100644 index afe9461..0000000 --- a/linux/tests/test_pipeline.cpp +++ /dev/null @@ -1,284 +0,0 @@ -#include "speech.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// --------------------------------------------------------------------------- -// Test helpers -// --------------------------------------------------------------------------- - -static int tests_run = 0; -static int tests_passed = 0; - -#define TEST(name) \ - static void test_##name(); \ - static struct Register_##name { \ - Register_##name() { test_funcs.push_back({#name, test_##name}); } \ - } reg_##name; \ - static void test_##name() - -#define ASSERT(cond) do { \ - if (!(cond)) { \ - fprintf(stderr, " FAIL: %s (line %d)\n", #cond, __LINE__); \ - return; \ - } \ -} while(0) - -#define PASS() tests_passed++ - -struct TestFunc { const char* name; void (*fn)(); }; -static std::vector test_funcs; - -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -TEST(config_default) { - speech_config_t cfg = speech_config_default(); - ASSERT(cfg.use_int8 == true); - ASSERT(cfg.use_qnn == false); - ASSERT(cfg.min_silence_duration > 0.0f); - ASSERT(cfg.model_dir == nullptr); - PASS(); -} - -TEST(version) { - const char* v = speech_version(); - ASSERT(v != nullptr); - ASSERT(strlen(v) > 0); - PASS(); -} - -TEST(create_null_dir_fails) { - speech_config_t cfg = speech_config_default(); - cfg.model_dir = nullptr; - speech_pipeline_t p = speech_create(cfg, nullptr, nullptr); - ASSERT(p == nullptr); - PASS(); -} - -TEST(create_bad_dir_fails) { - speech_config_t cfg = speech_config_default(); - cfg.model_dir = "/nonexistent/path"; - speech_pipeline_t p = speech_create(cfg, 
nullptr, nullptr); - ASSERT(p == nullptr); - PASS(); -} - -TEST(destroy_null_safe) { - speech_destroy(nullptr); - PASS(); -} - -TEST(push_null_safe) { - float buf[512] = {}; - speech_push_audio(nullptr, buf, 512); - speech_start(nullptr); - speech_resume_listening(nullptr); - PASS(); -} - -// If models are available, test the full pipeline -static const char* find_model_dir() { - const char* env = getenv("SPEECH_MODEL_DIR"); - if (env) return env; - // Check common locations - static const char* paths[] = { - "./models", - "../models", - "../tests/models", - "/opt/speech/models", - nullptr - }; - for (const char** p = paths; *p; p++) { - char path[512]; - snprintf(path, sizeof(path), "%s/silero-vad.onnx", *p); - FILE* f = fopen(path, "r"); - if (f) { fclose(f); return *p; } - } - return nullptr; -} - -struct EventLog { - std::atomic transcriptions{0}; - std::atomic speech_started{0}; - std::atomic speech_ended{0}; - std::string last_text; -}; - -static void test_event_cb(const speech_event_t* event, void* ctx) { - auto* log = static_cast(ctx); - switch (event->type) { - case SPEECH_EVENT_SPEECH_STARTED: log->speech_started++; break; - case SPEECH_EVENT_SPEECH_ENDED: log->speech_ended++; break; - case SPEECH_EVENT_TRANSCRIPTION: - log->transcriptions++; - if (event->text) log->last_text = event->text; - break; - default: break; - } -} - -TEST(pipeline_lifecycle) { - const char* dir = find_model_dir(); - if (!dir) { fprintf(stderr, " SKIP (no models)\n"); PASS(); return; } - - speech_config_t cfg = speech_config_default(); - cfg.model_dir = dir; - cfg.transcribe_only = true; - - EventLog log; - speech_pipeline_t p = speech_create(cfg, test_event_cb, &log); - ASSERT(p != nullptr); - - speech_start(p); - - // Push 2 seconds of silence - float silence[512] = {}; - for (int i = 0; i < 62; i++) { - speech_push_audio(p, silence, 512); - std::this_thread::sleep_for(std::chrono::milliseconds(5)); - } - - speech_destroy(p); - // No crash = success - PASS(); -} - 
-TEST(pipeline_speech_detection) { - const char* dir = find_model_dir(); - if (!dir) { fprintf(stderr, " SKIP (no models)\n"); PASS(); return; } - - speech_config_t cfg = speech_config_default(); - cfg.model_dir = dir; - cfg.transcribe_only = true; - - EventLog log; - speech_pipeline_t p = speech_create(cfg, test_event_cb, &log); - ASSERT(p != nullptr); - - speech_start(p); - - // Push speech-like signal (150Hz buzz) for 1.5s - float speech[512]; - for (int chunk = 0; chunk < 47; chunk++) { - for (int i = 0; i < 512; i++) { - float t = (float)(chunk * 512 + i) / 16000.0f; - speech[i] = 0.3f * sinf(2.0f * 3.14159f * 150.0f * t) - + 0.2f * sinf(2.0f * 3.14159f * 300.0f * t); - } - speech_push_audio(p, speech, 512); - std::this_thread::sleep_for(std::chrono::milliseconds(5)); - } - - // Push 1.5s silence to trigger end-of-speech - float silence[512] = {}; - for (int i = 0; i < 47; i++) { - speech_push_audio(p, silence, 512); - std::this_thread::sleep_for(std::chrono::milliseconds(5)); - } - - // Wait for processing - std::this_thread::sleep_for(std::chrono::seconds(3)); - - speech_destroy(p); - - // VAD should have detected speech - ASSERT(log.speech_started > 0); - PASS(); -} - -TEST(resume_listening_null_safe) { - speech_resume_listening(nullptr); - PASS(); -} - -TEST(pipeline_multiple_sessions) { - const char* dir = find_model_dir(); - if (!dir) { fprintf(stderr, " SKIP (no models)\n"); PASS(); return; } - - for (int session = 0; session < 3; session++) { - speech_config_t cfg = speech_config_default(); - cfg.model_dir = dir; - cfg.transcribe_only = true; - - EventLog log; - speech_pipeline_t p = speech_create(cfg, test_event_cb, &log); - ASSERT(p != nullptr); - - speech_start(p); - - // Push 1 second of silence - float silence[512] = {}; - for (int i = 0; i < 31; i++) { - speech_push_audio(p, silence, 512); - std::this_thread::sleep_for(std::chrono::milliseconds(5)); - } - - speech_destroy(p); - } - // No crash or leak after 3 create/destroy cycles - PASS(); -} - 
-TEST(pipeline_concurrent_push) { - const char* dir = find_model_dir(); - if (!dir) { fprintf(stderr, " SKIP (no models)\n"); PASS(); return; } - - speech_config_t cfg = speech_config_default(); - cfg.model_dir = dir; - cfg.transcribe_only = true; - - EventLog log; - speech_pipeline_t p = speech_create(cfg, test_event_cb, &log); - ASSERT(p != nullptr); - - speech_start(p); - - // Push audio from 4 threads concurrently - std::vector threads; - for (int t = 0; t < 4; t++) { - threads.emplace_back([p]() { - float buf[512] = {}; - for (int i = 0; i < 50; i++) { - speech_push_audio(p, buf, 512); - std::this_thread::sleep_for(std::chrono::milliseconds(2)); - } - }); - } - - for (auto& th : threads) { - th.join(); - } - - speech_destroy(p); - // No crash under concurrent push - PASS(); -} - -// --------------------------------------------------------------------------- -// Main -// --------------------------------------------------------------------------- - -int main() { - fprintf(stderr, "speech-linux tests (%s)\n\n", speech_version()); - - for (auto& t : test_funcs) { - tests_run++; - fprintf(stderr, " %s... ", t.name); - t.fn(); - if (tests_passed == tests_run) { - fprintf(stderr, "ok\n"); - } - } - - fprintf(stderr, "\n%d/%d passed\n", tests_passed, tests_run); - return tests_passed == tests_run ? 
0 : 1; -} diff --git a/linux/toolchain-aarch64.cmake b/linux/toolchain-aarch64.cmake deleted file mode 100644 index 272ed2d..0000000 --- a/linux/toolchain-aarch64.cmake +++ /dev/null @@ -1,11 +0,0 @@ -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_PROCESSOR aarch64) - -# Yocto SDK cross-compiler (source environment-setup-aarch64-poky-linux first) -set(CMAKE_C_COMPILER $ENV{CC} CACHE STRING "" FORCE) -set(CMAKE_CXX_COMPILER $ENV{CXX} CACHE STRING "" FORCE) -set(CMAKE_SYSROOT $ENV{SDKTARGETSYSROOT} CACHE STRING "" FORCE) - -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/linux/tools/phonemize.cpp b/linux/tools/phonemize.cpp deleted file mode 100644 index a183df3..0000000 --- a/linux/tools/phonemize.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Tiny CLI that dumps the phoneme string + token IDs the Kokoro phonemizer -// produces for a piece of text. Used to verify text→phoneme conversion is -// correct before blaming the TTS model. -// -// Usage: speech_phonemize "" [language] - -#include "models/kokoro_phonemizer.h" - -#include -#include - -int main(int argc, char** argv) { - if (argc < 3) { - std::fprintf(stderr, - "usage: %s \"\" [language]\n" - " model_dir : directory holding vocab_index.json + dictionaries\n" - " language : BCP-47 tag (default: en)\n", - argv[0]); - return 2; - } - const std::string model_dir = argv[1]; - const std::string text = argv[2]; - const std::string language = (argc >= 4) ? 
argv[3] : "en"; - - KokoroPhonemizer p; - if (!p.load_vocab(model_dir + "/vocab_index.json")) { - std::fprintf(stderr, "failed to load vocab from %s/vocab_index.json\n", - model_dir.c_str()); - return 1; - } - p.load_dictionaries(model_dir); - for (const char* lang : {"fr", "es", "it", "pt", "hi"}) { - p.load_language_dict(lang, model_dir + "/dict_" + std::string(lang) + ".json"); - } - p.set_language(language); - - std::string phonemes = p.text_to_phonemes(text); - auto ids = p.tokenize(text, 128); - - std::printf("text : %s\n", text.c_str()); - std::printf("language : %s\n", language.c_str()); - std::printf("phonemes : %s\n", phonemes.c_str()); - std::printf("tokens : [%zu]", ids.size()); - for (auto id : ids) std::printf(" %lld", static_cast(id)); - std::printf("\n"); - return 0; -} diff --git a/linux/tools/synthesize.cpp b/linux/tools/synthesize.cpp deleted file mode 100644 index 2701608..0000000 --- a/linux/tools/synthesize.cpp +++ /dev/null @@ -1,110 +0,0 @@ -// Tiny CLI that runs Kokoro TTS on a piece of text and writes the audio to a WAV. -// -// Usage: speech_synthesize "" [language] -// -// Pairs with speech_transcribe — round-trip a known prompt through synthesis -// and back through STT to surface phonemizer / tokenizer / decoder bugs -// without bouncing through Android. -// -// Calls KokoroTts directly (skipping the speech-core pipeline) so we can -// inspect the raw audio buffer the model emits. 
- -#include "models/kokoro_tts.h" - -#include -#include -#include -#include -#include -#include - -namespace { - -constexpr int kSampleRate = 24000; - -struct ChunkSink { - std::vector samples; -}; - -static void on_chunk(const float* samples, size_t length, - bool /*is_final*/, void* ctx) { - auto* sink = static_cast(ctx); - sink->samples.insert(sink->samples.end(), samples, samples + length); -} - -static bool write_wav(const std::string& path, - const float* samples, size_t count, int sample_rate) { - std::ofstream f(path, std::ios::binary); - if (!f.is_open()) return false; - - auto put32 = [&](uint32_t v) { - char b[4] = {char(v & 0xFF), char((v >> 8) & 0xFF), - char((v >> 16) & 0xFF), char((v >> 24) & 0xFF)}; - f.write(b, 4); - }; - auto put16 = [&](uint16_t v) { - char b[2] = {char(v & 0xFF), char((v >> 8) & 0xFF)}; - f.write(b, 2); - }; - - const uint32_t data_bytes = static_cast(count) * 2; - f.write("RIFF", 4); put32(36 + data_bytes); - f.write("WAVE", 4); - f.write("fmt ", 4); put32(16); - put16(1); // PCM - put16(1); // mono - put32(static_cast(sample_rate)); - put32(static_cast(sample_rate) * 2); - put16(2); // block align - put16(16); // bits/sample - f.write("data", 4); put32(data_bytes); - - for (size_t i = 0; i < count; i++) { - float clamped = samples[i]; - if (clamped < -1.0f) clamped = -1.0f; - if (clamped > 1.0f) clamped = 1.0f; - int16_t v = static_cast(clamped * 32767.0f); - put16(static_cast(v)); - } - return f.good(); -} - -} // namespace - -int main(int argc, char** argv) { - if (argc < 4) { - std::fprintf(stderr, - "usage: %s \"\" [language]\n" - " model_dir : directory holding kokoro-e2e.onnx + voices/*.bin\n" - " language : BCP-47 tag (default: en). Auto-switches voice.\n", - argv[0]); - return 2; - } - const std::string model_dir = argv[1]; - const std::string out_wav = argv[2]; - const std::string text = argv[3]; - const std::string language = (argc >= 5) ? 
argv[4] : "en"; - - KokoroTts tts(model_dir + "/kokoro-e2e.onnx", - model_dir + "/voices", - model_dir, - /*nnapi=*/false); - - ChunkSink sink; - tts.synthesize(text.c_str(), language.c_str(), on_chunk, &sink); - - if (sink.samples.empty()) { - std::fprintf(stderr, "synthesis produced no audio\n"); - return 1; - } - - if (!write_wav(out_wav, sink.samples.data(), sink.samples.size(), kSampleRate)) { - std::fprintf(stderr, "could not write %s\n", out_wav.c_str()); - return 1; - } - std::fprintf(stderr, "wrote %zu samples (%.2fs @ %d Hz) to %s\n", - sink.samples.size(), - double(sink.samples.size()) / double(kSampleRate), - kSampleRate, out_wav.c_str()); - return 0; -} diff --git a/linux/tools/transcribe.cpp b/linux/tools/transcribe.cpp deleted file mode 100644 index 2bad307..0000000 --- a/linux/tools/transcribe.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// Tiny CLI that runs Parakeet STT on a WAV file and prints what it heard. -// -// Usage: speech_transcribe -// -// Reads PCM Float32 / Int16 / Int24 mono or stereo at any sample rate, then -// resamples + downmixes to 16 kHz mono Float32 and feeds it through the -// pipeline. Useful for diagnosing TTS round-trip quality (synthesise speech, -// transcribe it back, compare to the original prompt). -// -// No external deps beyond libspeech. 
- -#include "speech.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace { - -constexpr int kTargetSampleRate = 16000; -constexpr size_t kChunkSamples = 512; // 32 ms at 16 kHz - -// --------------------------------------------------------------------------- -// WAV reader -// --------------------------------------------------------------------------- - -struct WavData { - std::vector samples; // mono, target sample rate - int sample_rate = 0; - int original_sample_rate = 0; - int original_channels = 0; - int original_bits = 0; -}; - -static uint32_t read_u32(const uint8_t* p) { - return uint32_t(p[0]) | (uint32_t(p[1]) << 8) - | (uint32_t(p[2]) << 16) | (uint32_t(p[3]) << 24); -} -static uint16_t read_u16(const uint8_t* p) { - return uint16_t(p[0]) | (uint16_t(p[1]) << 8); -} - -static bool load_wav(const std::string& path, WavData& out, std::string& err) { - std::ifstream f(path, std::ios::binary); - if (!f.is_open()) { err = "cannot open " + path; return false; } - - std::vector bytes((std::istreambuf_iterator(f)), - std::istreambuf_iterator()); - if (bytes.size() < 44) { err = "file too small to be a WAV"; return false; } - if (std::memcmp(bytes.data(), "RIFF", 4) != 0 || - std::memcmp(bytes.data() + 8, "WAVE", 4) != 0) { - err = "not a RIFF/WAVE file"; - return false; - } - - // Walk chunks looking for fmt + data. 
- size_t pos = 12; - uint16_t fmt_format = 0, fmt_channels = 0, fmt_bits = 0; - uint32_t fmt_rate = 0; - const uint8_t* data_ptr = nullptr; - uint32_t data_len = 0; - while (pos + 8 <= bytes.size()) { - const uint8_t* hdr = bytes.data() + pos; - const char tag[5] = {char(hdr[0]), char(hdr[1]), char(hdr[2]), char(hdr[3]), 0}; - uint32_t chunk_len = read_u32(hdr + 4); - if (pos + 8 + chunk_len > bytes.size()) break; - if (std::strcmp(tag, "fmt ") == 0 && chunk_len >= 16) { - fmt_format = read_u16(hdr + 8); - fmt_channels = read_u16(hdr + 10); - fmt_rate = read_u32(hdr + 12); - fmt_bits = read_u16(hdr + 22); - } else if (std::strcmp(tag, "data") == 0) { - data_ptr = hdr + 8; - data_len = chunk_len; - break; - } - pos += 8 + chunk_len + (chunk_len & 1); // pad to even - } - if (!data_ptr || fmt_channels == 0) { - err = "WAV has no fmt or data chunk"; - return false; - } - if (fmt_format != 1 /*PCM*/ && fmt_format != 3 /*FLOAT*/) { - err = "WAV format " + std::to_string(fmt_format) - + " unsupported (need PCM=1 or FLOAT=3)"; - return false; - } - - out.original_sample_rate = static_cast(fmt_rate); - out.original_channels = fmt_channels; - out.original_bits = fmt_bits; - - // Decode + downmix to mono float - const size_t bytes_per_sample = fmt_bits / 8; - const size_t frame_bytes = bytes_per_sample * fmt_channels; - const size_t frame_count = data_len / frame_bytes; - std::vector mono(frame_count); - for (size_t i = 0; i < frame_count; i++) { - float sum = 0.0f; - for (int c = 0; c < fmt_channels; c++) { - const uint8_t* sp = data_ptr + i * frame_bytes + c * bytes_per_sample; - float s = 0.0f; - if (fmt_format == 3 && fmt_bits == 32) { - std::memcpy(&s, sp, 4); - } else if (fmt_format == 1 && fmt_bits == 16) { - int16_t v = int16_t(uint16_t(sp[0]) | (uint16_t(sp[1]) << 8)); - s = float(v) / 32768.0f; - } else if (fmt_format == 1 && fmt_bits == 24) { - int32_t v = int32_t(uint32_t(sp[0]) - | (uint32_t(sp[1]) << 8) | (uint32_t(sp[2]) << 16)); - if (v & 0x800000) v |= 
0xFF000000; // sign extend - s = float(v) / 8388608.0f; - } else if (fmt_format == 1 && fmt_bits == 32) { - int32_t v = int32_t(read_u32(sp)); - s = float(v) / 2147483648.0f; - } else { - err = "unsupported sample width " + std::to_string(fmt_bits); - return false; - } - sum += s; - } - mono[i] = sum / float(fmt_channels); - } - - // Linear-interpolation resample to kTargetSampleRate. Cheap, but - // adequate for diagnosing model output — TTS bandwidth is well below - // 8 kHz so aliasing isn't a meaningful concern here. - if (static_cast(fmt_rate) == kTargetSampleRate) { - out.samples = std::move(mono); - } else { - const double ratio = double(fmt_rate) / double(kTargetSampleRate); - const size_t out_len = size_t(double(mono.size()) / ratio); - out.samples.resize(out_len); - for (size_t i = 0; i < out_len; i++) { - double src = double(i) * ratio; - size_t i0 = size_t(src); - double frac = src - double(i0); - float a = mono[i0]; - float b = (i0 + 1 < mono.size()) ? mono[i0 + 1] : a; - out.samples[i] = float(double(a) + (double(b) - double(a)) * frac); - } - } - out.sample_rate = kTargetSampleRate; - return true; -} - -// --------------------------------------------------------------------------- -// Pipeline event handler -// --------------------------------------------------------------------------- - -struct Result { - std::mutex mu; - std::condition_variable cv; - std::string text; - float confidence = 0.0f; - bool done = false; - bool error = false; -}; - -static void on_event(const speech_event_t* event, void* ctx) { - auto* r = static_cast(ctx); - std::unique_lock lock(r->mu); - switch (event->type) { - case SPEECH_EVENT_TRANSCRIPTION: - if (event->text) r->text = event->text; - r->confidence = event->confidence; - r->done = true; - r->cv.notify_all(); - break; - case SPEECH_EVENT_PARTIAL_TRANSCRIPTION: - if (event->text) { - std::cerr << " [partial] " << event->text << "\r" << std::flush; - } - break; - case SPEECH_EVENT_ERROR: - std::cerr << " [error] " << 
(event->text ? event->text : "") << "\n"; - r->error = true; - r->done = true; - r->cv.notify_all(); - break; - default: - break; - } -} - -} // namespace - -int main(int argc, char** argv) { - if (argc != 3) { - std::fprintf(stderr, - "usage: %s \n" - " model_dir : directory holding parakeet-* + silero-vad.onnx\n" - " input.wav : audio to transcribe (mono or stereo, 16-bit/24-bit/float)\n", - argv[0]); - return 2; - } - const std::string model_dir = argv[1]; - const std::string wav_path = argv[2]; - - WavData wav; - std::string err; - if (!load_wav(wav_path, wav, err)) { - std::fprintf(stderr, "wav: %s\n", err.c_str()); - return 1; - } - std::fprintf(stderr, - "loaded %s: %d Hz × %dch × %d-bit → %.2fs of 16 kHz mono\n", - wav_path.c_str(), - wav.original_sample_rate, wav.original_channels, wav.original_bits, - double(wav.samples.size()) / double(wav.sample_rate)); - - speech_config_t cfg = speech_config_default(); - cfg.model_dir = model_dir.c_str(); - cfg.transcribe_only = true; - - Result result; - speech_pipeline_t pipeline = speech_create(cfg, on_event, &result); - if (!pipeline) { - std::fprintf(stderr, "speech_create failed (model dir? 
missing files?)\n"); - return 1; - } - speech_start(pipeline); - - // Push real audio - for (size_t off = 0; off < wav.samples.size(); off += kChunkSamples) { - size_t n = std::min(kChunkSamples, wav.samples.size() - off); - speech_push_audio(pipeline, wav.samples.data() + off, n); - } - // Trailing 1.5 s of silence so VAD sees end-of-utterance and Parakeet flushes - std::vector silence(kChunkSamples, 0.0f); - for (int i = 0; i < 47; i++) { - speech_push_audio(pipeline, silence.data(), silence.size()); - } - - // Wait up to 30 s for the transcription event - { - std::unique_lock lock(result.mu); - result.cv.wait_for(lock, std::chrono::seconds(30), - [&]{ return result.done; }); - } - - speech_destroy(pipeline); - - if (!result.done || result.error) { - std::fprintf(stderr, "transcription did not complete\n"); - return 1; - } - // Result on stdout — single line, useful for piping - std::printf("%s\n", result.text.c_str()); - std::fprintf(stderr, "confidence: %.3f\n", result.confidence); - return 0; -} diff --git a/sdk/src/main/cpp/CMakeLists.txt b/sdk/src/main/cpp/CMakeLists.txt index 3dc8bc8..16f6ceb 100644 --- a/sdk/src/main/cpp/CMakeLists.txt +++ b/sdk/src/main/cpp/CMakeLists.txt @@ -6,71 +6,41 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) # --------------------------------------------------------------------------- # speech-core (git submodule) +# +# Pull in speech-core via add_subdirectory so we reuse its target definitions +# rather than re-listing sources here. SPEECH_CORE_WITH_ONNX=ON adds the +# speech_core_models target (Silero VAD / Parakeet STT / Kokoro TTS / +# DeepFilterNet wrappers). Tests and examples are off — the Android NDK +# can't run host ctest binaries anyway. 
# --------------------------------------------------------------------------- if(NOT DEFINED SPEECH_CORE_DIR) - message(FATAL_ERROR "SPEECH_CORE_DIR must be set") + message(FATAL_ERROR "SPEECH_CORE_DIR must be set (path to the speech-core submodule)") endif() -set(SPEECH_CORE_SOURCES - ${SPEECH_CORE_DIR}/src/pipeline/voice_pipeline.cpp - ${SPEECH_CORE_DIR}/src/pipeline/turn_detector.cpp - ${SPEECH_CORE_DIR}/src/pipeline/speech_queue.cpp - ${SPEECH_CORE_DIR}/src/pipeline/conversation_context.cpp - ${SPEECH_CORE_DIR}/src/vad/streaming_vad.cpp - ${SPEECH_CORE_DIR}/src/audio/audio_buffer.cpp - ${SPEECH_CORE_DIR}/src/audio/resampler.cpp - ${SPEECH_CORE_DIR}/src/audio/pcm_codec.cpp - ${SPEECH_CORE_DIR}/src/tools/tool_registry.cpp - ${SPEECH_CORE_DIR}/src/tools/intent_matcher.cpp - ${SPEECH_CORE_DIR}/src/tools/tool_executor.cpp - ${SPEECH_CORE_DIR}/src/speech_core_c.cpp -) - -add_library(speech_core STATIC ${SPEECH_CORE_SOURCES}) -target_include_directories(speech_core PUBLIC ${SPEECH_CORE_DIR}/include) -target_compile_options(speech_core PRIVATE -O2) - -# --------------------------------------------------------------------------- -# ONNX Runtime (prebuilt) -# --------------------------------------------------------------------------- - if(NOT DEFINED ORT_DIR) - message(FATAL_ERROR "ORT_DIR must be set (path to onnxruntime with include/ and lib/)") + message(FATAL_ERROR "ORT_DIR must be set (path to onnxruntime with include/ and lib/${ANDROID_ABI}/libonnxruntime.so)") endif() -add_library(onnxruntime SHARED IMPORTED) -set_target_properties(onnxruntime PROPERTIES - IMPORTED_LOCATION ${ORT_DIR}/lib/${ANDROID_ABI}/libonnxruntime.so -) +set(SPEECH_CORE_WITH_ONNX ON CACHE BOOL "" FORCE) +set(SPEECH_CORE_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(SPEECH_CORE_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) + +add_subdirectory(${SPEECH_CORE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/speech_core) # --------------------------------------------------------------------------- -# speech-android native 
library +# speech-android native library — JNI bridge only +# +# All audio DSP / model wrappers / onnx_engine now live in speech-core's +# speech_core_models target. The bridge just constructs them and feeds +# them into speech_core::VoicePipeline. # --------------------------------------------------------------------------- -add_library(speech_android SHARED - jni_bridge.cpp - audio/mel.cpp - audio/fft.cpp - audio/stft.cpp - models/soc_detect.cpp - models/silero_vad.cpp - models/parakeet_stt.cpp - models/kokoro_tts.cpp - models/kokoro_phonemizer.cpp - models/kokoro_multilingual.cpp - models/deepfilter.cpp -) - -target_include_directories(speech_android PRIVATE - ${ORT_DIR}/include - ${SPEECH_CORE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR} -) +add_library(speech_android SHARED jni_bridge.cpp) target_link_libraries(speech_android - speech_core - onnxruntime - android - log + PRIVATE + speech_core_models + android + log ) diff --git a/sdk/src/main/cpp/audio/fft.cpp b/sdk/src/main/cpp/audio/fft.cpp deleted file mode 100644 index dc221fd..0000000 --- a/sdk/src/main/cpp/audio/fft.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include "fft.h" -#include -#include - -static void fft_complex(float* re, float* im, size_t n, bool inverse) { - // Bit-reversal permutation - for (size_t i = 1, j = 0; i < n; i++) { - size_t bit = n >> 1; - while (j & bit) { j ^= bit; bit >>= 1; } - j ^= bit; - if (i < j) { - std::swap(re[i], re[j]); - std::swap(im[i], im[j]); - } - } - - // Cooley-Tukey - float sign = inverse ? 
1.0f : -1.0f; - for (size_t len = 2; len <= n; len <<= 1) { - float ang = sign * 2.0f * static_cast(M_PI) / static_cast(len); - float wr = std::cos(ang), wi = std::sin(ang); - - for (size_t i = 0; i < n; i += len) { - float cur_r = 1.0f, cur_i = 0.0f; - for (size_t j = 0; j < len / 2; j++) { - size_t u = i + j, v = i + j + len / 2; - float tr = re[v] * cur_r - im[v] * cur_i; - float ti = re[v] * cur_i + im[v] * cur_r; - re[v] = re[u] - tr; - im[v] = im[u] - ti; - re[u] += tr; - im[u] += ti; - float new_r = cur_r * wr - cur_i * wi; - cur_i = cur_r * wi + cur_i * wr; - cur_r = new_r; - } - } - } - - if (inverse) { - float inv_n = 1.0f / static_cast(n); - for (size_t i = 0; i < n; i++) { - re[i] *= inv_n; - im[i] *= inv_n; - } - } -} - -// Zero-pad to next power of 2 for non-power-of-2 FFT sizes -static size_t next_pow2(size_t n) { - size_t p = 1; - while (p < n) p <<= 1; - return p; -} - -void fft_real(const float* input, size_t n, - float* out_real, float* out_imag) -{ - size_t N = next_pow2(n); - std::vector re(N, 0.0f), im(N, 0.0f); - for (size_t i = 0; i < n; i++) re[i] = input[i]; - - fft_complex(re.data(), im.data(), N, false); - - size_t bins = n / 2 + 1; - for (size_t i = 0; i < bins; i++) { - out_real[i] = re[i]; - out_imag[i] = im[i]; - } -} - -void ifft_real(const float* in_real, const float* in_imag, size_t n, - float* output) -{ - size_t N = next_pow2(n); - std::vector re(N, 0.0f), im(N, 0.0f); - - size_t bins = n / 2 + 1; - for (size_t i = 0; i < bins; i++) { - re[i] = in_real[i]; - im[i] = in_imag[i]; - } - // Conjugate symmetry - for (size_t i = bins; i < N; i++) { - re[i] = re[N - i]; - im[i] = -im[N - i]; - } - - fft_complex(re.data(), im.data(), N, true); - - for (size_t i = 0; i < n; i++) output[i] = re[i]; -} diff --git a/sdk/src/main/cpp/audio/fft.h b/sdk/src/main/cpp/audio/fft.h deleted file mode 100644 index bfe5433..0000000 --- a/sdk/src/main/cpp/audio/fft.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include -#include - -/// Minimal 
radix-2 FFT for Android NDK (no external dependencies). -/// Operates on real signals — returns complex spectrum [0..N/2]. - -void fft_real(const float* input, size_t n, - float* out_real, float* out_imag); - -void ifft_real(const float* in_real, const float* in_imag, size_t n, - float* output); diff --git a/sdk/src/main/cpp/audio/mel.cpp b/sdk/src/main/cpp/audio/mel.cpp deleted file mode 100644 index 3d2edf0..0000000 --- a/sdk/src/main/cpp/audio/mel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -#include "mel.h" -#include "fft.h" -#include -#include - -// HTK mel scale (used when slaney_norm=false). -static float htk_hz_to_mel(float hz) { - return 2595.0f * std::log10(1.0f + hz / 700.0f); -} -static float htk_mel_to_hz(float mel) { - return 700.0f * (std::pow(10.0f, mel / 2595.0f) - 1.0f); -} - -// Slaney mel scale (used when slaney_norm=true): -// Linear below 1000 Hz: mel = 3 * f / 200 -// Log above 1000 Hz: mel = 15 + 27 * log(f/1000) / log(6.4) -static constexpr float kSlaneyBreakHz = 1000.0f; -static constexpr float kSlaneyBreakMel = 15.0f; // 3 * 1000 / 200 -static const float kSlaneyLogStep = 27.0f / std::log(6.4f); // ≈ 14.536 - -static float slaney_hz_to_mel(float hz) { - if (hz < kSlaneyBreakHz) - return 3.0f * hz / 200.0f; - return kSlaneyBreakMel + std::log(hz / kSlaneyBreakHz) * kSlaneyLogStep; -} -static float slaney_mel_to_hz(float mel) { - if (mel < kSlaneyBreakMel) - return 200.0f * mel / 3.0f; - return kSlaneyBreakHz * std::exp((mel - kSlaneyBreakMel) / kSlaneyLogStep); -} - -static std::vector mel_filterbank( - int num_mel_bins, int n_fft, int sample_rate, bool slaney_norm) -{ - int num_bins = n_fft / 2 + 1; - - // Choose mel scale: Slaney (torchaudio default) when slaney_norm is on, - // HTK otherwise (backward compat). - auto hz2mel = slaney_norm ? slaney_hz_to_mel : htk_hz_to_mel; - auto mel2hz = slaney_norm ? 
slaney_mel_to_hz : htk_mel_to_hz; - - float mel_low = hz2mel(0.0f); - float mel_high = hz2mel(static_cast(sample_rate) / 2.0f); - - std::vector mel_points(num_mel_bins + 2); - // Hz centres of each mel point (for Slaney norm later). - std::vector hz_points(num_mel_bins + 2); - for (int i = 0; i < num_mel_bins + 2; i++) { - float mel = mel_low + (mel_high - mel_low) * i / (num_mel_bins + 1); - hz_points[i] = mel2hz(mel); - } - - // Convert to FFT bin indices - std::vector bin_freqs(num_mel_bins + 2); - for (int i = 0; i < num_mel_bins + 2; i++) { - bin_freqs[i] = hz_points[i] * n_fft / sample_rate; - } - - // Triangular filters [num_mel_bins * num_bins] - std::vector fb(num_mel_bins * num_bins, 0.0f); - for (int m = 0; m < num_mel_bins; m++) { - float left = bin_freqs[m]; - float center = bin_freqs[m + 1]; - float right = bin_freqs[m + 2]; - - for (int f = 0; f < num_bins; f++) { - float ff = static_cast(f); - if (ff >= left && ff <= center && center > left) { - fb[m * num_bins + f] = (ff - left) / (center - left); - } else if (ff > center && ff <= right && right > center) { - fb[m * num_bins + f] = (right - ff) / (right - center); - } - } - - // Slaney normalization: divide each filter by its bandwidth in Hz - // so the filter has unit area. Matches torchaudio norm="slaney". - if (slaney_norm) { - float bandwidth = hz_points[m + 2] - hz_points[m]; - if (bandwidth > 0.0f) { - float enorm = 2.0f / bandwidth; - for (int f = 0; f < num_bins; f++) { - fb[m * num_bins + f] *= enorm; - } - } - } - } - return fb; -} - -std::vector mel_spectrogram( - const float* audio, size_t length, - int sample_rate, int n_fft, int hop_length, - int win_length, int num_mel_bins, - bool slaney_norm, float log_floor, bool center) -{ - // Optional center padding: pad signal by n_fft/2 on each side using - // reflect mode (matches torchaudio / NeMo center=True). 
- std::vector padded; - const float* sig = audio; - size_t sig_len = length; - - if (center) { - int pad = n_fft / 2; - sig_len = length + 2 * static_cast(pad); - padded.resize(sig_len); - - // Left reflect padding: padded[pad-1-i] = audio[i+1] for i in [0, pad-1) - for (int i = 0; i < pad; ++i) { - int src = std::min(i + 1, static_cast(length) - 1); - padded[pad - 1 - i] = audio[src]; - } - // Copy original signal - std::copy(audio, audio + length, padded.begin() + pad); - // Right reflect padding - for (int i = 0; i < pad; ++i) { - int src = std::max(static_cast(length) - 2 - i, 0); - padded[pad + static_cast(length) + i] = audio[src]; - } - sig = padded.data(); - } - - int num_bins = n_fft / 2 + 1; - int num_frames = static_cast((sig_len - static_cast(win_length)) - / hop_length) + 1; - if (num_frames <= 0) return {}; - - auto fb = mel_filterbank(num_mel_bins, n_fft, sample_rate, slaney_norm); - - // Hann window - std::vector window(win_length); - for (int i = 0; i < win_length; i++) { - window[i] = 0.5f * (1.0f - std::cos(2.0f * static_cast(M_PI) - * i / (win_length - 1))); - } - - // STFT + mel - std::vector mel(num_mel_bins * num_frames); - std::vector frame(n_fft, 0.0f); - std::vector spec_re(num_bins), spec_im(num_bins); - - for (int t = 0; t < num_frames; t++) { - // Windowed frame (zero-padded if win_length < n_fft) - std::fill(frame.begin(), frame.end(), 0.0f); - for (int i = 0; i < win_length; i++) { - frame[i] = sig[t * hop_length + i] * window[i]; - } - - fft_real(frame.data(), n_fft, spec_re.data(), spec_im.data()); - - // Power spectrum → mel → log - for (int m = 0; m < num_mel_bins; m++) { - float sum = 0.0f; - for (int f = 0; f < num_bins; f++) { - float power = spec_re[f] * spec_re[f] - + spec_im[f] * spec_im[f]; - sum += power * fb[m * num_bins + f]; - } - mel[m * num_frames + t] = std::log(sum + log_floor); - } - } - - return mel; -} diff --git a/sdk/src/main/cpp/audio/mel.h b/sdk/src/main/cpp/audio/mel.h deleted file mode 100644 index 
7350c84..0000000 --- a/sdk/src/main/cpp/audio/mel.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include -#include - -/// Compute log-mel spectrogram from raw audio. -/// Returns flattened [num_mel_bins, num_frames] in channels-first layout -/// (row = mel bin, column = time frame). -/// -/// Optional parameters (default to the original behaviour): -/// slaney_norm — area-normalise each triangular filter by its bandwidth -/// log_floor — additive floor before log: log(x + floor) -/// center — pad signal by n_fft/2 on each side (reflect mode) -std::vector mel_spectrogram( - const float* audio, size_t length, - int sample_rate, int n_fft, int hop_length, - int win_length, int num_mel_bins, - bool slaney_norm = false, - float log_floor = 1e-10f, - bool center = false); diff --git a/sdk/src/main/cpp/audio/stft.cpp b/sdk/src/main/cpp/audio/stft.cpp deleted file mode 100644 index 8e89ead..0000000 --- a/sdk/src/main/cpp/audio/stft.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "stft.h" -#include "fft.h" -#include -#include -#include - -int stft_num_frames(size_t signal_length, int fft_size, int hop_size) { - if (static_cast(signal_length) < fft_size) return 0; - return static_cast((signal_length - fft_size) / hop_size) + 1; -} - -void stft_forward(const float* audio, size_t length, - int fft_size, int hop_size, - const float* window, - float* out_real, float* out_imag) -{ - int num_frames = stft_num_frames(length, fft_size, hop_size); - int freq_bins = fft_size / 2 + 1; - std::vector frame(fft_size); - - for (int t = 0; t < num_frames; t++) { - // Apply window - for (int i = 0; i < fft_size; i++) { - frame[i] = audio[t * hop_size + i] * window[i]; - } - - fft_real(frame.data(), fft_size, - out_real + t * freq_bins, - out_imag + t * freq_bins); - } -} - -void stft_inverse(const float* spec_real, const float* spec_imag, - int num_frames, int fft_size, int hop_size, - const float* window, - float* output, size_t out_length) -{ - int freq_bins = fft_size / 2 + 1; - 
std::vector frame(fft_size); - std::vector win_sum(out_length, 0.0f); - - std::memset(output, 0, out_length * sizeof(float)); - - for (int t = 0; t < num_frames; t++) { - ifft_real(spec_real + t * freq_bins, - spec_imag + t * freq_bins, - fft_size, frame.data()); - - // Overlap-add with synthesis window - for (int i = 0; i < fft_size; i++) { - size_t idx = t * hop_size + i; - if (idx >= out_length) break; - output[idx] += frame[i] * window[i]; - win_sum[idx] += window[i] * window[i]; - } - } - - // Normalize by window sum - for (size_t i = 0; i < out_length; i++) { - if (win_sum[i] > 1e-8f) { - output[i] /= win_sum[i]; - } - } -} diff --git a/sdk/src/main/cpp/audio/stft.h b/sdk/src/main/cpp/audio/stft.h deleted file mode 100644 index b5f5160..0000000 --- a/sdk/src/main/cpp/audio/stft.h +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include - -/// Number of STFT frames for a given signal length. -int stft_num_frames(size_t signal_length, int fft_size, int hop_size); - -/// Forward STFT with overlap-add windowing. -/// @param audio Input signal -/// @param length Number of samples -/// @param fft_size FFT size (e.g. 960 for DeepFilterNet3) -/// @param hop_size Hop size (e.g. 480) -/// @param window Analysis window [fft_size] -/// @param out_real Output real spectrum [num_frames * freq_bins] -/// @param out_imag Output imaginary spectrum [num_frames * freq_bins] -void stft_forward(const float* audio, size_t length, - int fft_size, int hop_size, - const float* window, - float* out_real, float* out_imag); - -/// Inverse STFT via overlap-add. 
-/// @param spec_real Real spectrum [num_frames * freq_bins] -/// @param spec_imag Imaginary spectrum [num_frames * freq_bins] -/// @param num_frames Number of STFT frames -/// @param fft_size FFT size -/// @param hop_size Hop size -/// @param window Synthesis window [fft_size] -/// @param output Output signal buffer -/// @param out_length Expected output length (samples) -void stft_inverse(const float* spec_real, const float* spec_imag, - int num_frames, int fft_size, int hop_size, - const float* window, - float* output, size_t out_length); diff --git a/sdk/src/main/cpp/jni_bridge.cpp b/sdk/src/main/cpp/jni_bridge.cpp index 9ec1ed5..c2bc7cd 100644 --- a/sdk/src/main/cpp/jni_bridge.cpp +++ b/sdk/src/main/cpp/jni_bridge.cpp @@ -1,41 +1,42 @@ #include #include -#include -#include -#include -#include "models/onnx_engine.h" -#include "models/silero_vad.h" -#include "models/parakeet_stt.h" -#include "models/kokoro_tts.h" -#include "models/deepfilter.h" +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include #define LOG_TAG "Speech" #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__) #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__) // --------------------------------------------------------------------------- -// Pipeline handle — owns all native objects for one pipeline instance +// Pipeline handle +// +// speech_core::* model wrappers directly implement the speech_core interfaces +// (VADInterface / STTInterface / TTSInterface / EnhancerInterface), so the +// JNI bridge constructs them and hands references to VoicePipeline. No +// C-vtable adapters needed — the entire vtable boilerplate that used to live +// here was deleted in this change. 
// --------------------------------------------------------------------------- struct PipelineHandle { - sc_pipeline_t pipeline = nullptr; - SileroVad* vad = nullptr; - ParakeetStt* stt = nullptr; - KokoroTts* tts = nullptr; - DeepFilterEnhancer* enhancer = nullptr; + std::unique_ptr vad; + std::unique_ptr stt; + std::unique_ptr tts; + std::unique_ptr enhancer; + std::unique_ptr pipeline; JavaVM* jvm = nullptr; jobject callback = nullptr; jmethodID on_event_mid = nullptr; - - ~PipelineHandle() { - if (pipeline) sc_pipeline_destroy(pipeline); - delete enhancer; - delete tts; - delete stt; - delete vad; - } }; // --------------------------------------------------------------------------- @@ -51,152 +52,68 @@ static JNIEnv* get_env(JavaVM* jvm) { } // --------------------------------------------------------------------------- -// speech-core vtable adapters +// Pipeline event → Kotlin onEvent +// +// Kotlin signature unchanged: +// void onEvent(int type, String text, byte[] audio, +// float confidence, float sttMs, float ttsMs) // --------------------------------------------------------------------------- -// --- VAD --- - -static float vad_process_chunk(void* ctx, const float* samples, size_t len) { - return static_cast(ctx)->process_chunk(samples, len); -} -static void vad_reset(void* ctx) { - static_cast(ctx)->reset(); -} -static int vad_sample_rate(void* ctx) { - return static_cast(ctx)->input_sample_rate(); -} -static size_t vad_chunk_size(void* ctx) { - return static_cast(ctx)->chunk_size(); -} - -// --- STT --- - -static sc_transcription_result_t stt_transcribe( - void* ctx, const float* audio, size_t len, int sr) -{ - auto* stt = static_cast(ctx); - auto r = stt->transcribe(audio, len, sr); - - // Static buffers — valid until next call (per C API contract) - static thread_local std::string text_buf; - static thread_local std::string lang_buf; - text_buf = std::move(r.text); - lang_buf = std::move(r.language); - - return { - .text = text_buf.c_str(), - 
.language = lang_buf.empty() ? nullptr : lang_buf.c_str(), - .confidence = r.confidence, - .start_time = 0.0f, - .end_time = 0.0f, - }; -} -static int stt_sample_rate(void* ctx) { - return static_cast(ctx)->input_sample_rate(); -} - -static void stt_begin_stream(void* ctx, int sample_rate) { - static_cast(ctx)->begin_stream(sample_rate); -} - -static sc_partial_result_t stt_push_chunk(void* ctx, const float* audio, size_t len) { - auto* stt = static_cast(ctx); - auto r = stt->push_chunk(audio, len); - static thread_local std::string text_buf; - static thread_local std::string lang_buf; - text_buf = std::move(r.text); - lang_buf = std::move(r.language); - return { - .text = text_buf.c_str(), - .language = lang_buf.empty() ? nullptr : lang_buf.c_str(), - .confidence = r.confidence, - }; -} - -static void stt_flush_stream(void* ctx) { - static_cast(ctx)->flush_stream(); -} - -static sc_transcription_result_t stt_end_stream(void* ctx) { - auto* stt = static_cast(ctx); - auto r = stt->end_stream(); - static thread_local std::string text_buf; - static thread_local std::string lang_buf; - text_buf = std::move(r.text); - lang_buf = std::move(r.language); - return { - .text = text_buf.c_str(), - .language = lang_buf.empty() ? 
nullptr : lang_buf.c_str(), - .confidence = r.confidence, - .start_time = 0.0f, - .end_time = 0.0f, - }; -} - -static void stt_cancel_stream(void* ctx) { - static_cast(ctx)->cancel_stream(); -} - -// --- TTS --- - -static void tts_synthesize( - void* ctx, const char* text, const char* language, - sc_tts_chunk_fn on_chunk, void* chunk_ctx) -{ - auto* tts = static_cast(ctx); - tts->synthesize(text, language, on_chunk, chunk_ctx); -} -static int tts_sample_rate(void* ctx) { - return static_cast(ctx)->output_sample_rate(); -} -static void tts_cancel(void* ctx) { - static_cast(ctx)->cancel(); -} - -// --- Enhancer --- - -static void enhancer_enhance( - void* ctx, const float* input, size_t len, int sr, float* output) -{ - static_cast(ctx)->enhance(input, len, sr, output); -} -static int enhancer_sample_rate(void* ctx) { - return static_cast(ctx)->input_sample_rate(); +// Map speech_core::EventType → the int values the Kotlin side expects. +// +// Kotlin's SpeechPipeline.kt switches on raw ints inherited from the original +// C ABI (sc_event_t.type), whose ordering differs from speech_core::EventType: +// the C ABI had ResponseAudioDelta=7 / ResponseDone=8, the enum has them +// swapped. Map explicitly so renumbering speech_core::EventType in the future +// can't silently break the Kotlin event stream. 
+static jint to_kotlin_event(speech_core::EventType t) { + using ET = speech_core::EventType; + switch (t) { + case ET::SessionCreated: return 0; + case ET::SpeechStarted: return 1; + case ET::SpeechEnded: return 2; + case ET::PartialTranscription: return 3; + case ET::TranscriptionCompleted: return 4; + case ET::ResponseCreated: return 5; + case ET::ResponseInterrupted: return 6; + case ET::ResponseAudioDelta: return 7; + case ET::ResponseDone: return 8; + case ET::ToolCallStarted: return 9; + case ET::ToolCallCompleted: return 10; + case ET::Error: return 11; + } + return -1; } -// --------------------------------------------------------------------------- -// Event callback → Kotlin -// --------------------------------------------------------------------------- - -static void on_pipeline_event(const sc_event_t* event, void* context) { - auto* handle = static_cast(context); +static void dispatch_event(PipelineHandle* h, + const speech_core::PipelineEvent& event) { LOGI("event type=%d text='%.60s' audio=%zu stt=%.0fms tts=%.0fms", - event->type, event->text ? event->text : "", - event->audio_data_length, event->stt_duration_ms, event->tts_duration_ms); - if (!handle->callback) return; + static_cast(event.type), event.text.c_str(), + event.audio_data.size(), event.stt_duration_ms, + event.tts_duration_ms); + + if (!h->callback) return; - JNIEnv* env = get_env(handle->jvm); + JNIEnv* env = get_env(h->jvm); if (!env) return; - jstring text = event->text - ? env->NewStringUTF(event->text) : nullptr; + jstring text = !event.text.empty() + ? 
env->NewStringUTF(event.text.c_str()) : nullptr; jbyteArray audio = nullptr; - if (event->audio_data && event->audio_data_length > 0) { - audio = env->NewByteArray(static_cast(event->audio_data_length)); + if (!event.audio_data.empty()) { + audio = env->NewByteArray(static_cast(event.audio_data.size())); env->SetByteArrayRegion(audio, 0, - static_cast(event->audio_data_length), - reinterpret_cast(event->audio_data)); + static_cast(event.audio_data.size()), + reinterpret_cast(event.audio_data.data())); } - // void onEvent(int type, String text, byte[] audio, - // float confidence, float sttMs, float ttsMs) - env->CallVoidMethod(handle->callback, handle->on_event_mid, - static_cast(event->type), + env->CallVoidMethod(h->callback, h->on_event_mid, + to_kotlin_event(event.type), text, audio, - event->confidence, - event->stt_duration_ms, - event->tts_duration_ms); + event.confidence, + event.stt_duration_ms, + event.tts_duration_ms); if (audio) env->DeleteLocalRef(audio); if (text) env->DeleteLocalRef(text); @@ -227,7 +144,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeCreate( bool nnapi = useNnapi; std::string suffix = useInt8 ? 
"-int8" : ""; - auto* h = new PipelineHandle(); + auto h = std::make_unique(); env->GetJavaVM(&h->jvm); h->callback = env->NewGlobalRef(callback); @@ -238,62 +155,38 @@ Java_audio_soniqo_speech_NativeBridge_nativeCreate( try { // Load models - h->vad = new SileroVad(dir + "/silero-vad.onnx", false); - h->stt = new ParakeetStt( + h->vad = std::make_unique( + dir + "/silero-vad.onnx", /*hw_accel=*/false); + h->stt = std::make_unique( dir + "/parakeet-encoder" + suffix + ".onnx", dir + "/parakeet-decoder-joint" + suffix + ".onnx", dir + "/vocab.json", nnapi); - h->tts = new KokoroTts( + h->tts = std::make_unique( dir + "/kokoro-e2e.onnx", dir + "/voices", dir, nnapi); - // Build vtables - sc_vad_vtable_t vad_vt = { - .context = h->vad, - .process_chunk = vad_process_chunk, - .reset = vad_reset, - .input_sample_rate = vad_sample_rate, - .chunk_size = vad_chunk_size, - }; - - sc_stt_vtable_t stt_vt = {}; - stt_vt.context = h->stt; - stt_vt.transcribe = stt_transcribe; - stt_vt.input_sample_rate = stt_sample_rate; - stt_vt.begin_stream = stt_begin_stream; - stt_vt.push_chunk = stt_push_chunk; - stt_vt.flush_stream = stt_flush_stream; - stt_vt.end_stream = stt_end_stream; - stt_vt.cancel_stream = stt_cancel_stream; - - sc_tts_vtable_t tts_vt = {}; - tts_vt.context = h->tts; - tts_vt.synthesize = tts_synthesize; - tts_vt.output_sample_rate = tts_sample_rate; - tts_vt.cancel = tts_cancel; - - // Pipeline config - sc_config_t config = sc_config_default(); - config.min_silence_duration = 0.5f; - config.eager_stt = false; - config.min_speech_duration = 0.15f; - config.post_playback_guard = 0.15f; - config.emit_partial_transcriptions = emitPartialTranscriptions; - config.partial_transcription_interval = partialTranscriptionInterval; - - config.mode = SC_MODE_ECHO; - h->pipeline = sc_pipeline_create( - stt_vt, tts_vt, nullptr, vad_vt, - config, on_pipeline_event, h); + speech_core::AgentConfig cfg; + cfg.vad.min_silence_duration = 0.5f; + cfg.vad.min_speech_duration = 0.15f; + 
cfg.eager_stt = false; + cfg.post_playback_guard = 0.15f; + cfg.emit_partial_transcriptions = emitPartialTranscriptions; + cfg.partial_transcription_interval = partialTranscriptionInterval; + cfg.mode = speech_core::AgentConfig::Mode::Echo; // Note: DeepFilterNet3 noise cancellation is disabled in the pipeline. - // DFN operates at 48kHz but the pipeline pushes 16kHz audio — running - // DFN without resampling produces artifacts. Needs 16k→48k→DFN→48k→16k - // resample chain before it can be re-enabled. See issue #12. - // The model is still downloaded for future use. + // DFN operates at 48 kHz but the pipeline pushes 16 kHz audio — + // running DFN without resampling produces artifacts. Needs a + // 16k→48k→DFN→48k→16k resample chain before it can be re-enabled. + // See issue #12. The model is still downloaded for future use. + + PipelineHandle* raw = h.get(); + h->pipeline = std::make_unique( + *h->stt, *h->tts, /*llm=*/nullptr, *h->vad, cfg, + [raw](const speech_core::PipelineEvent& e) { dispatch_event(raw, e); }); auto& engine = OnnxEngine::get(); if (engine.had_nnapi_fallback()) { @@ -305,7 +198,6 @@ Java_audio_soniqo_speech_NativeBridge_nativeCreate( } catch (const std::exception& e) { LOGE("Pipeline creation failed: %s", e.what()); if (h->callback) env->DeleteGlobalRef(h->callback); - delete h; jclass ex_cls = env->FindClass("java/lang/RuntimeException"); if (ex_cls) { std::string msg = std::string("Native pipeline failed: ") + e.what(); @@ -314,7 +206,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeCreate( return 0; } - return reinterpret_cast(h); + return reinterpret_cast(h.release()); } JNIEXPORT jstring JNICALL @@ -344,7 +236,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeStart( JNIEnv* /*env*/, jobject /*thiz*/, jlong handle) { auto* h = reinterpret_cast(handle); - if (h && h->pipeline) sc_pipeline_start(h->pipeline); + if (h && h->pipeline) h->pipeline->start(); } JNIEXPORT void JNICALL @@ -352,7 +244,7 @@ 
Java_audio_soniqo_speech_NativeBridge_nativeStop( JNIEnv* /*env*/, jobject /*thiz*/, jlong handle) { auto* h = reinterpret_cast(handle); - if (h && h->pipeline) sc_pipeline_stop(h->pipeline); + if (h && h->pipeline) h->pipeline->stop(); } JNIEXPORT void JNICALL @@ -364,7 +256,7 @@ Java_audio_soniqo_speech_NativeBridge_nativePushAudio( if (!h || !h->pipeline) return; float* data = env->GetFloatArrayElements(samples, nullptr); - sc_pipeline_push_audio(h->pipeline, data, static_cast(count)); + h->pipeline->push_audio(data, static_cast(count)); env->ReleaseFloatArrayElements(samples, data, JNI_ABORT); } @@ -373,7 +265,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeResumeListen( JNIEnv* /*env*/, jobject /*thiz*/, jlong handle) { auto* h = reinterpret_cast(handle); - if (h && h->pipeline) sc_pipeline_resume_listening(h->pipeline); + if (h && h->pipeline) h->pipeline->resume_listening(); } JNIEXPORT jint JNICALL @@ -381,8 +273,8 @@ Java_audio_soniqo_speech_NativeBridge_nativeGetState( JNIEnv* /*env*/, jobject /*thiz*/, jlong handle) { auto* h = reinterpret_cast(handle); - if (!h || !h->pipeline) return SC_STATE_IDLE; - return sc_pipeline_state(h->pipeline); + if (!h || !h->pipeline) return 0; + return static_cast(h->pipeline->state()); } } // extern "C" diff --git a/sdk/src/main/cpp/models/deepfilter.cpp b/sdk/src/main/cpp/models/deepfilter.cpp deleted file mode 100644 index eb43bae..0000000 --- a/sdk/src/main/cpp/models/deepfilter.cpp +++ /dev/null @@ -1,192 +0,0 @@ -#include "deepfilter.h" -#include "onnx_engine.h" -#include "../audio/stft.h" -#include -#include -#include - -DeepFilterEnhancer::DeepFilterEnhancer( - const std::string& model_path, - const std::string& auxiliary_path, - bool nnapi) -{ - auto& engine = OnnxEngine::get(); - api_ = engine.api(); - session_ = engine.load(model_path, nnapi); - load_auxiliary(auxiliary_path); -} - -DeepFilterEnhancer::~DeepFilterEnhancer() { - if (session_) api_->ReleaseSession(session_); -} - -void 
DeepFilterEnhancer::load_auxiliary(const std::string& path) { - // Load precomputed ERB filterbanks and window from binary file. - // Format: erb_fb [481*32] | erb_inv_fb [32*481] | window [960] (float32) - std::ifstream file(path, std::ios::binary); - if (!file.is_open()) { - LOGE("Auxiliary file not found: %s", path.c_str()); - return; - } - - erb_fb_.resize(cfg_.freq_bins * cfg_.erb_bands); - erb_inv_fb_.resize(cfg_.erb_bands * cfg_.freq_bins); - window_.resize(cfg_.fft_size); - - file.read(reinterpret_cast(erb_fb_.data()), - erb_fb_.size() * sizeof(float)); - file.read(reinterpret_cast(erb_inv_fb_.data()), - erb_inv_fb_.size() * sizeof(float)); - file.read(reinterpret_cast(window_.data()), - window_.size() * sizeof(float)); -} - -void DeepFilterEnhancer::compute_erb_features( - const float* spec_real, const float* spec_imag, int num_frames, - std::vector& feat_erb, std::vector& feat_spec) -{ - feat_erb.resize(num_frames * cfg_.erb_bands); - feat_spec.resize(num_frames * 2 * cfg_.df_bins); - - for (int t = 0; t < num_frames; t++) { - // Power spectrum → ERB bands - for (int b = 0; b < cfg_.erb_bands; b++) { - float sum = 0.0f; - for (int f = 0; f < cfg_.freq_bins; f++) { - float re = spec_real[t * cfg_.freq_bins + f]; - float im = spec_imag[t * cfg_.freq_bins + f]; - sum += (re * re + im * im) * erb_fb_[f * cfg_.erb_bands + b]; - } - feat_erb[t * cfg_.erb_bands + b] = 10.0f * std::log10(sum + 1e-10f); - } - - // Complex spectrum for deep-filtered bins - for (int f = 0; f < cfg_.df_bins; f++) { - feat_spec[t * 2 * cfg_.df_bins + f] = - spec_real[t * cfg_.freq_bins + f]; - feat_spec[t * 2 * cfg_.df_bins + cfg_.df_bins + f] = - spec_imag[t * cfg_.freq_bins + f]; - } - } -} - -void DeepFilterEnhancer::apply_erb_mask( - float* spec_real, float* spec_imag, - const float* mask, int num_frames) -{ - for (int t = 0; t < num_frames; t++) { - for (int f = 0; f < cfg_.freq_bins; f++) { - // Expand ERB mask to full spectrum - float gain = 0.0f; - for (int b = 0; b < 
cfg_.erb_bands; b++) { - gain += mask[t * cfg_.erb_bands + b] - * erb_inv_fb_[b * cfg_.freq_bins + f]; - } - spec_real[t * cfg_.freq_bins + f] *= gain; - spec_imag[t * cfg_.freq_bins + f] *= gain; - } - } -} - -void DeepFilterEnhancer::apply_deep_filter( - float* spec_real, float* spec_imag, - const float* coefs, int num_frames) -{ - int pad_before = cfg_.df_order - 1 - cfg_.df_lookahead; - - for (int t = 0; t < num_frames; t++) { - for (int f = 0; f < cfg_.df_bins; f++) { - float out_re = 0.0f, out_im = 0.0f; - - for (int n = 0; n < cfg_.df_order; n++) { - int src_t = t + n - pad_before; - if (src_t < 0 || src_t >= num_frames) continue; - - float x_re = spec_real[src_t * cfg_.freq_bins + f]; - float x_im = spec_imag[src_t * cfg_.freq_bins + f]; - - // coefs layout: [1, df_order, T, df_bins, 2] - int idx = (n * num_frames * cfg_.df_bins + t * cfg_.df_bins + f) * 2; - float w_re = coefs[idx]; - float w_im = coefs[idx + 1]; - - // Complex multiply - out_re += x_re * w_re - x_im * w_im; - out_im += x_re * w_im + x_im * w_re; - } - - spec_real[t * cfg_.freq_bins + f] = out_re; - spec_imag[t * cfg_.freq_bins + f] = out_im; - } - } -} - -void DeepFilterEnhancer::enhance( - const float* audio, size_t length, int /*sample_rate*/, float* output) -{ - auto* mem = OnnxEngine::get().cpu_memory(); - - // --- STFT --- - - int num_frames = stft_num_frames(length, cfg_.fft_size, cfg_.hop_size); - std::vector spec_real(num_frames * cfg_.freq_bins); - std::vector spec_imag(num_frames * cfg_.freq_bins); - - stft_forward(audio, length, cfg_.fft_size, cfg_.hop_size, - window_.data(), spec_real.data(), spec_imag.data()); - - // --- features --- - - std::vector feat_erb, feat_spec; - compute_erb_features(spec_real.data(), spec_imag.data(), - num_frames, feat_erb, feat_spec); - - // --- ONNX inference --- - - int64_t T = num_frames; - const int64_t erb_shape[] = {1, 1, T, cfg_.erb_bands}; - const int64_t spec_shape[] = {1, 2, T, cfg_.df_bins}; - - OrtValue* t_erb = nullptr; - 
ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, feat_erb.data(), feat_erb.size() * sizeof(float), - erb_shape, 4, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_erb)); - - OrtValue* t_spec = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, feat_spec.data(), feat_spec.size() * sizeof(float), - spec_shape, 4, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_spec)); - - const char* in_names[] = {"feat_erb", "feat_spec"}; - const char* out_names[] = {"erb_mask", "df_coefs"}; - OrtValue* inputs[] = {t_erb, t_spec}; - OrtValue* outputs[] = {nullptr, nullptr}; - - ort_check(api_, api_->Run( - session_, nullptr, - in_names, inputs, 2, - out_names, 2, outputs)); - - float* erb_mask = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[0], (void**)&erb_mask)); - float* df_coefs = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[1], (void**)&df_coefs)); - - // --- apply mask + deep filter --- - - apply_erb_mask(spec_real.data(), spec_imag.data(), erb_mask, num_frames); - apply_deep_filter(spec_real.data(), spec_imag.data(), df_coefs, num_frames); - - // --- inverse STFT --- - - stft_inverse(spec_real.data(), spec_imag.data(), num_frames, - cfg_.fft_size, cfg_.hop_size, - window_.data(), output, length); - - // --- cleanup --- - - api_->ReleaseValue(outputs[1]); - api_->ReleaseValue(outputs[0]); - api_->ReleaseValue(t_spec); - api_->ReleaseValue(t_erb); -} diff --git a/sdk/src/main/cpp/models/deepfilter.h b/sdk/src/main/cpp/models/deepfilter.h deleted file mode 100644 index c6603e7..0000000 --- a/sdk/src/main/cpp/models/deepfilter.h +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include -#include -#include - -/// DeepFilterNet3 — real-time speech enhancement / noise cancellation. -/// Processes audio at 48 kHz using STFT + ERB filterbank + neural network. -/// Model size: ~2.1M parameters (~8 MB FP16). 
-class DeepFilterEnhancer { -public: - struct Config { - int fft_size = 960; - int hop_size = 480; - int erb_bands = 32; - int df_bins = 96; // deep-filtered frequency bins - int df_order = 5; // filter taps - int df_lookahead = 2; - int freq_bins = 481; // fft_size / 2 + 1 - int sample_rate = 48000; - }; - - DeepFilterEnhancer(const std::string& model_path, - const std::string& auxiliary_path, - bool nnapi = true); - ~DeepFilterEnhancer(); - - /// Enhance audio by removing noise. - /// @param audio Input PCM Float32 at 48 kHz - /// @param length Number of samples - /// @param sample_rate Input sample rate (must be 48000) - /// @param output Pre-allocated output buffer (same length) - void enhance(const float* audio, size_t length, int sample_rate, - float* output); - - int input_sample_rate() const { return cfg_.sample_rate; } - -private: - void load_auxiliary(const std::string& path); - void compute_erb_features(const float* spectrum_real, - const float* spectrum_imag, - int num_frames, - std::vector& feat_erb, - std::vector& feat_spec); - void apply_erb_mask(float* spectrum_real, float* spectrum_imag, - const float* mask, int num_frames); - void apply_deep_filter(float* spectrum_real, float* spectrum_imag, - const float* coefs, int num_frames); - - const OrtApi* api_; - OrtSession* session_ = nullptr; - Config cfg_; - - // ERB filterbanks - std::vector erb_fb_; // [freq_bins, erb_bands] - std::vector erb_inv_fb_; // [erb_bands, freq_bins] - std::vector window_; // Vorbis window [fft_size] -}; diff --git a/sdk/src/main/cpp/models/inference_engine.h b/sdk/src/main/cpp/models/inference_engine.h deleted file mode 100644 index c2eb10e..0000000 --- a/sdk/src/main/cpp/models/inference_engine.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -/// Supported inference backends. -enum class Backend { ONNX, LITERT, AUTO }; - -/// Tensor element data types. 
-enum class DType { FLOAT32, INT64, INT32, INT8 }; - -/// Describes a tensor's data, shape, and type for passing to inference. -struct TensorInfo { - const void* data; - std::vector shape; - DType dtype; - - size_t byte_size() const { - size_t elems = 1; - for (auto d : shape) elems *= static_cast(d); - switch (dtype) { - case DType::FLOAT32: return elems * 4; - case DType::INT64: return elems * 8; - case DType::INT32: return elems * 4; - case DType::INT8: return elems * 1; - } - return elems * 4; - } -}; - -/// Wraps a single output tensor from an inference call. -/// Owns the backend-specific memory — valid until destroyed or next run(). -class OutputTensor { -public: - virtual ~OutputTensor() = default; - - virtual float* data_float() = 0; - virtual int64_t* data_int64() = 0; - virtual std::vector shape() = 0; - virtual size_t element_count() = 0; -}; - -/// A loaded model session — run inference with named inputs/outputs. -class InferenceSession { -public: - virtual ~InferenceSession() = default; - - /// Run inference. Outputs are returned as owned OutputTensor objects. - virtual std::vector> run( - const std::vector& input_names, - const std::vector& inputs, - const std::vector& output_names) = 0; -}; - -/// Factory for loading models. Each backend implements this. -class InferenceBackend { -public: - virtual ~InferenceBackend() = default; - - virtual std::unique_ptr load( - const std::string& path, bool hw_accel = true) = 0; - - virtual Backend type() const = 0; -}; - -/// Detect the optimal backend for the current device's SoC. -Backend detect_optimal_backend(); - -/// Create a backend instance. AUTO resolves via detect_optimal_backend(). 
-std::unique_ptr create_backend(Backend preference); diff --git a/sdk/src/main/cpp/models/kokoro_multilingual.cpp b/sdk/src/main/cpp/models/kokoro_multilingual.cpp deleted file mode 100644 index 9987258..0000000 --- a/sdk/src/main/cpp/models/kokoro_multilingual.cpp +++ /dev/null @@ -1,1841 +0,0 @@ -#include "kokoro_multilingual.h" -#include -#include -#include -#include -#include - -// --------------------------------------------------------------------------- -// UTF-8 helpers -// --------------------------------------------------------------------------- - -/// Decode one UTF-8 character, returning codepoint and advancing pos. -static uint32_t utf8_decode(const std::string& s, size_t& pos) { - if (pos >= s.size()) return 0; - unsigned char c = static_cast(s[pos]); - uint32_t cp; - size_t len; - if (c < 0x80) { cp = c; len = 1; } - else if (c < 0xC0) { cp = c; len = 1; } // continuation — error - else if (c < 0xE0) { cp = c & 0x1F; len = 2; } - else if (c < 0xF0) { cp = c & 0x0F; len = 3; } - else { cp = c & 0x07; len = 4; } - for (size_t i = 1; i < len && (pos + i) < s.size(); i++) { - cp = (cp << 6) | (static_cast(s[pos + i]) & 0x3F); - } - pos += len; - return cp; -} - -/// Get one UTF-8 character as a string, advancing pos. -static std::string utf8_char_at(const std::string& s, size_t& pos) { - if (pos >= s.size()) return ""; - unsigned char c = static_cast(s[pos]); - size_t len = 1; - if ((c & 0xE0) == 0xC0) len = 2; - else if ((c & 0xF0) == 0xE0) len = 3; - else if ((c & 0xF8) == 0xF0) len = 4; - if (pos + len > s.size()) len = s.size() - pos; - std::string result = s.substr(pos, len); - pos += len; - return result; -} - -/// Split UTF-8 string into individual characters. -static std::vector utf8_split(const std::string& s) { - std::vector out; - size_t pos = 0; - while (pos < s.size()) { - out.push_back(utf8_char_at(s, pos)); - } - return out; -} - -/// Encode a Unicode codepoint to UTF-8. 
-static std::string utf8_encode(uint32_t cp) { - std::string out; - if (cp < 0x80) { - out += static_cast(cp); - } else if (cp < 0x800) { - out += static_cast(0xC0 | (cp >> 6)); - out += static_cast(0x80 | (cp & 0x3F)); - } else if (cp < 0x10000) { - out += static_cast(0xE0 | (cp >> 12)); - out += static_cast(0x80 | ((cp >> 6) & 0x3F)); - out += static_cast(0x80 | (cp & 0x3F)); - } else { - out += static_cast(0xF0 | (cp >> 18)); - out += static_cast(0x80 | ((cp >> 12) & 0x3F)); - out += static_cast(0x80 | ((cp >> 6) & 0x3F)); - out += static_cast(0x80 | (cp & 0x3F)); - } - return out; -} - -/// Get codepoint of first UTF-8 character. -static uint32_t utf8_codepoint(const std::string& s) { - size_t pos = 0; - return utf8_decode(s, pos); -} - -/// Check if a character is a vowel letter (for Latin languages). -static bool is_latin_vowel(char c) { - char lc = static_cast(std::tolower(static_cast(c))); - return lc == 'a' || lc == 'e' || lc == 'i' || lc == 'o' || lc == 'u' || lc == 'y'; -} - -/// Check if character at offset i in string is a vowel (ASCII only). -static bool is_vowel_at(const std::string& s, size_t i) { - if (i >= s.size()) return false; - return is_latin_vowel(s[i]); -} - -/// Check if character is a consonant letter (ASCII). -static bool is_consonant(char c) { - char lc = static_cast(std::tolower(static_cast(c))); - return lc >= 'a' && lc <= 'z' && !is_latin_vowel(lc); -} - -/// Lowercase an ASCII string. -static std::string to_lower_ascii(const std::string& s) { - std::string r = s; - for (auto& c : r) c = static_cast(std::tolower(static_cast(c))); - return r; -} - -/// Check if a string starts with prefix at position pos. -static bool starts_with_at(const std::string& s, size_t pos, const std::string& prefix) { - if (pos + prefix.size() > s.size()) return false; - return s.compare(pos, prefix.size(), prefix) == 0; -} - -/// Post-process IPA for Kokoro vocab compatibility. -/// Maps IPA symbols that Kokoro doesn't have to ones it does. 
-static std::string kokoro_postprocess(const std::string& ipa) { - std::string result = ipa; - - // dʒ → ʤ - { - std::string from = "d\xCA\x92"; // dʒ - std::string to = "\xCA\xA4"; // ʤ - size_t pos = 0; - while ((pos = result.find(from, pos)) != std::string::npos) { - result.replace(pos, from.size(), to); - pos += to.size(); - } - } - // tʃ → ʧ - { - std::string from = "t\xCA\x83"; // tʃ - std::string to = "\xCA\xA7"; // ʧ - size_t pos = 0; - while ((pos = result.find(from, pos)) != std::string::npos) { - result.replace(pos, from.size(), to); - pos += to.size(); - } - } - // ʁ → ɹ (French uvular R → Kokoro's approximant R) - { - std::string from = "\xCA\x81"; // ʁ - std::string to = "\xC9\xB9"; // ɹ - size_t pos = 0; - while ((pos = result.find(from, pos)) != std::string::npos) { - result.replace(pos, from.size(), to); - pos += to.size(); - } - } - return result; -} - -/// Check if position is at word end (next char is space, punct, or end). -static bool at_word_end(const std::string& s, size_t pos) { - if (pos >= s.size()) return true; - char c = s[pos]; - return c == ' ' || c == ',' || c == '.' || c == '!' || c == '?' 
- || c == ';' || c == ':' || c == '-' || c == '\n' || c == '\t'; -} - -// =========================================================================== -// FRENCH -// =========================================================================== - -std::string multilingual::french_g2p(const std::string& text) { - std::string s = to_lower_ascii(text); - std::string ipa; - size_t len = s.size(); - - for (size_t i = 0; i < len; ) { - // --- Trigraphs --- - if (i + 3 <= len) { - std::string tri = s.substr(i, 3); - if (tri == "eau") { ipa += "o"; i += 3; continue; } - if (tri == "ain") { - // ain before consonant or end = nasal - if (i + 3 >= len || !is_vowel_at(s, i + 3)) { - ipa += "\xC9\x9B\xCC\x83"; // ɛ̃ - i += 3; continue; - } - } - if (tri == "ein") { - if (i + 3 >= len || !is_vowel_at(s, i + 3)) { - ipa += "\xC9\x9B\xCC\x83"; // ɛ̃ - i += 3; continue; - } - } - if (tri == "oin") { - if (i + 3 >= len || !is_vowel_at(s, i + 3)) { - ipa += "w\xC9\x9B\xCC\x83"; // wɛ̃ - i += 3; continue; - } - } - if (tri == "ien") { - if (i + 3 >= len || !is_vowel_at(s, i + 3)) { - ipa += "j\xC9\x9B\xCC\x83"; // jɛ̃ - i += 3; continue; - } - } - } - - // --- Digraphs --- - if (i + 2 <= len) { - std::string di = s.substr(i, 2); - - // Nasal vowels (before consonant or end, not before vowel) - if (di == "on" || di == "om") { - if (i + 2 >= len || !is_vowel_at(s, i + 2)) { - // Check not followed by another n/m (e.g., "onne") - if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) { - // Not nasal — "bonne" → not nasal - } else { - ipa += "\xC9\x94\xCC\x83"; // ɔ̃ - i += 2; continue; - } - } - } - if (di == "an" || di == "am") { - if (i + 2 >= len || !is_vowel_at(s, i + 2)) { - if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) { - // Not nasal - } else { - ipa += "\xC9\x91\xCC\x83"; // ɑ̃ - i += 2; continue; - } - } - } - if (di == "en" || di == "em") { - if (i + 2 >= len || !is_vowel_at(s, i + 2)) { - if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) { - // Not nasal - } 
else { - ipa += "\xC9\x91\xCC\x83"; // ɑ̃ - i += 2; continue; - } - } - } - if (di == "in" || di == "im") { - if (i + 2 >= len || !is_vowel_at(s, i + 2)) { - if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) { - // Not nasal - } else { - ipa += "\xC9\x9B\xCC\x83"; // ɛ̃ - i += 2; continue; - } - } - } - if (di == "un" || di == "um") { - if (i + 2 >= len || !is_vowel_at(s, i + 2)) { - if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) { - // Not nasal - } else { - ipa += "\xC5\x93\xCC\x83"; // œ̃ - i += 2; continue; - } - } - } - - // Other digraphs - if (di == "ou") { ipa += "u"; i += 2; continue; } - if (di == "oi") { ipa += "wa"; i += 2; continue; } - if (di == "ai") { ipa += "\xC9\x9B"; i += 2; continue; } // ɛ - if (di == "ei") { ipa += "\xC9\x9B"; i += 2; continue; } // ɛ - if (di == "au") { ipa += "o"; i += 2; continue; } - if (di == "eu") { ipa += "\xC3\xB8"; i += 2; continue; } // ø - if (di == "ch") { ipa += "\xCA\x83"; i += 2; continue; } // ʃ - if (di == "ph") { ipa += "f"; i += 2; continue; } - if (di == "gn") { ipa += "\xC9\xB2"; i += 2; continue; } // ɲ - if (di == "qu") { ipa += "k"; i += 2; continue; } - if (di == "gu") { - // gu before e/i → g (silent u) - if (i + 2 < len && (s[i + 2] == 'e' || s[i + 2] == 'i')) { - ipa += "g"; i += 2; continue; - } - } - if (di == "ss") { ipa += "s"; i += 2; continue; } - if (di == "ll") { ipa += "l"; i += 2; continue; } - if (di == "tt") { ipa += "t"; i += 2; continue; } - if (di == "nn") { ipa += "n"; i += 2; continue; } - if (di == "mm") { ipa += "m"; i += 2; continue; } - if (di == "rr") { ipa += "\xCA\x81"; i += 2; continue; } // ʁ - } - - char c = s[i]; - - // Context-dependent consonants - if (c == 'c') { - if (i + 1 < len && (s[i + 1] == 'e' || s[i + 1] == 'i' || s[i + 1] == 'y')) { - ipa += "s"; - } else { - ipa += "k"; - } - i++; continue; - } - if (c == 'g') { - if (i + 1 < len && (s[i + 1] == 'e' || s[i + 1] == 'i')) { - ipa += "\xCA\x92"; // ʒ - } else { - ipa += "g"; - } - i++; continue; 
- } - - // Silent final consonants - if ((c == 'd' || c == 't' || c == 's' || c == 'x' || c == 'z' || c == 'p') - && at_word_end(s, i + 1)) { - i++; continue; - } - - // Simple consonant mappings - if (c == 'j') { ipa += "\xCA\x92"; i++; continue; } // ʒ - if (c == 'r') { ipa += "\xCA\x81"; i++; continue; } // ʁ - if (c == 'x') { ipa += "ks"; i++; continue; } - - // Vowels - if (c == 'e') { - // Final 'e' is often silent (schwa) - if (at_word_end(s, i + 1) && i > 0) { - // Silent final -e (except monosyllables) - i++; continue; - } - ipa += "\xC9\x99"; // ə - i++; continue; - } - if (c == 'u') { ipa += "y"; i++; continue; } - if (c == 'y') { ipa += "i"; i++; continue; } - - // Passthrough: a, i, o, b, d, f, k, l, m, n, p, t, v, w, z + punctuation - if (c == ' ') { ipa += " "; i++; continue; } - if (c >= 'a' && c <= 'z') { - ipa += c; - i++; continue; - } - - // Punctuation passthrough - if (c == ',' || c == '.' || c == '!' || c == '?' || c == ';' || c == ':' || c == '-') { - ipa += c; - i++; continue; - } - - // Skip unknown characters - size_t tmp = i; - utf8_char_at(s, tmp); - i = tmp; - } - - return kokoro_postprocess(ipa); -} - -// =========================================================================== -// SPANISH -// =========================================================================== - -std::string multilingual::spanish_g2p(const std::string& text) { - // Work with UTF-8 characters for accented vowels - auto chars = utf8_split(text); - std::string ipa; - - for (size_t i = 0; i < chars.size(); ) { - std::string c = chars[i]; - uint32_t cp = utf8_codepoint(c); - - // Lowercase for comparison - std::string cl; - if (cp >= 'A' && cp <= 'Z') { - cl = std::string(1, static_cast(cp + 32)); - } else { - cl = c; - } - - // --- Digraphs (check two chars) --- - std::string next_l; - if (i + 1 < chars.size()) { - uint32_t ncp = utf8_codepoint(chars[i + 1]); - if (ncp >= 'A' && ncp <= 'Z') { - next_l = std::string(1, static_cast(ncp + 32)); - } else { - next_l 
= chars[i + 1]; - } - } - - if (!next_l.empty()) { - std::string di = cl + next_l; - if (di == "ch") { ipa += "t\xCA\x83"; i += 2; continue; } // tʃ - if (di == "ll") { ipa += "\xCA\x9D"; i += 2; continue; } // ʝ - if (di == "rr") { ipa += "r"; i += 2; continue; } - if (di == "qu") { - // qu before e/i = k (silent u) - if (i + 2 < chars.size()) { - uint32_t nncp = utf8_codepoint(chars[i + 2]); - char nn = static_cast(std::tolower(nncp)); - if (nn == 'e' || nn == 'i') { - ipa += "k"; i += 2; continue; - } - } - ipa += "k"; i += 2; continue; - } - if (di == "gu") { - // gu before e/i = g (silent u) - if (i + 2 < chars.size()) { - uint32_t nncp = utf8_codepoint(chars[i + 2]); - char nn = static_cast(std::tolower(nncp)); - if (nn == 'e' || nn == 'i') { - ipa += "g"; i += 2; continue; - } - } - } - } - - // --- Accented vowels (stressed, lengthened) --- - // á = C3 A1, é = C3 A9, í = C3 AD, ó = C3 B3, ú = C3 BA - // Á = C3 81, É = C3 89, Í = C3 8D, Ó = C3 93, Ú = C3 9A - if (cp == 0xE1 || cp == 0xC1) { ipa += "a\xCB\x90"; i++; continue; } // aː - if (cp == 0xE9 || cp == 0xC9) { ipa += "e\xCB\x90"; i++; continue; } // eː - if (cp == 0xED || cp == 0xCD) { ipa += "i\xCB\x90"; i++; continue; } // iː - if (cp == 0xF3 || cp == 0xD3) { ipa += "o\xCB\x90"; i++; continue; } // oː - if (cp == 0xFA || cp == 0xDA) { ipa += "u\xCB\x90"; i++; continue; } // uː - - // ñ = C3 B1, Ñ = C3 91 - if (cp == 0xF1 || cp == 0xD1) { ipa += "\xC9\xB2"; i++; continue; } // ɲ - - // ü = C3 BC (used in güe, güi) - if (cp == 0xFC || cp == 0xDC) { ipa += "w"; i++; continue; } - - // --- Context-dependent consonants --- - if (cl == "c") { - if (!next_l.empty() && (next_l == "e" || next_l == "i")) { - ipa += "\xCE\xB8"; // θ (Castilian) - } else { - ipa += "k"; - } - i++; continue; - } - if (cl == "g") { - if (!next_l.empty() && (next_l == "e" || next_l == "i")) { - ipa += "x"; // velar fricative - } else { - ipa += "g"; - } - i++; continue; - } - if (cl == "j") { ipa += "x"; i++; continue; } - if (cl 
== "z") { ipa += "\xCE\xB8"; i++; continue; } // θ - if (cl == "v") { ipa += "b"; i++; continue; } // Spanish v = b - if (cl == "h") { i++; continue; } // silent h - if (cl == "x") { ipa += "ks"; i++; continue; } - - // Simple passthrough - if (cp == ' ') { ipa += " "; i++; continue; } - if (cp >= 'a' && cp <= 'z') { ipa += static_cast(cp); i++; continue; } - if (cp >= 'A' && cp <= 'Z') { ipa += static_cast(cp + 32); i++; continue; } - - // Punctuation - if (cp == ',' || cp == '.' || cp == '!' || cp == '?' || cp == ';' - || cp == ':' || cp == '-') { - ipa += static_cast(cp); - i++; continue; - } - // Inverted punctuation - if (cp == 0xBF || cp == 0xA1) { i++; continue; } // ¿ ¡ — skip - - i++; // skip unknown - } - - return kokoro_postprocess(ipa); -} - -// =========================================================================== -// PORTUGUESE -// =========================================================================== - -std::string multilingual::portuguese_g2p(const std::string& text) { - auto chars = utf8_split(text); - std::string ipa; - - for (size_t i = 0; i < chars.size(); ) { - std::string c = chars[i]; - uint32_t cp = utf8_codepoint(c); - - std::string cl; - if (cp >= 'A' && cp <= 'Z') { - cl = std::string(1, static_cast(cp + 32)); - } else { - cl = c; - } - - // Lookahead - std::string next_l, next2_l; - if (i + 1 < chars.size()) { - uint32_t ncp = utf8_codepoint(chars[i + 1]); - next_l = (ncp >= 'A' && ncp <= 'Z') - ? std::string(1, static_cast(ncp + 32)) : chars[i + 1]; - } - if (i + 2 < chars.size()) { - uint32_t ncp = utf8_codepoint(chars[i + 2]); - next2_l = (ncp >= 'A' && ncp <= 'Z') - ? 
std::string(1, static_cast(ncp + 32)) : chars[i + 2]; - } - - // --- Trigraphs --- - if (!next_l.empty() && !next2_l.empty()) { - std::string tri = cl + next_l + next2_l; - // ção → sɐ̃w̃ - if (cp == 0xE7 || cp == 0xC7) { // ç - if (next_l == "a" || next_l == "\xC3\xA3") { // ã (U+00E3) - uint32_t n2cp = utf8_codepoint(chars[i + 2]); - if (n2cp == 'o' || n2cp == 0xF5) { // o or õ - ipa += "s\xC9\x90\xCC\x83w\xCC\x83"; // sɐ̃w̃ - i += 3; continue; - } - } - } - if (tri == "lha" || tri == "lhe" || tri == "lhi" || tri == "lho" || tri == "lhu") { - // lh → ʎ - ipa += "\xCA\x8E"; // ʎ - i += 2; continue; // consume lh, leave vowel for next iteration - } - if (tri == "nha" || tri == "nhe" || tri == "nhi" || tri == "nho" || tri == "nhu") { - ipa += "\xC9\xB2"; // ɲ - i += 2; continue; - } - } - - // --- Digraphs --- - if (!next_l.empty()) { - std::string di = cl + next_l; - if (di == "nh") { ipa += "\xC9\xB2"; i += 2; continue; } // ɲ - if (di == "lh") { ipa += "\xCA\x8E"; i += 2; continue; } // ʎ - if (di == "ch") { ipa += "\xCA\x83"; i += 2; continue; } // ʃ - if (di == "ss") { ipa += "s"; i += 2; continue; } - if (di == "rr") { ipa += "\xCA\x81"; i += 2; continue; } // ʁ - if (di == "qu") { ipa += "k"; i += 2; continue; } - if (di == "gu") { - if (!next2_l.empty() && (next2_l == "e" || next2_l == "i")) { - ipa += "g"; i += 2; continue; - } - } - if (di == "ou") { ipa += "ow"; i += 2; continue; } - if (di == "ei") { ipa += "ej"; i += 2; continue; } - if (di == "ai") { ipa += "aj"; i += 2; continue; } - if (di == "oi") { ipa += "oj"; i += 2; continue; } - } - - // --- Nasal vowels (ã, õ) --- - // ã = U+00E3, õ = U+00F5 - if (cp == 0xE3 || cp == 0xC3) { - // Check for ão - if (!next_l.empty()) { - uint32_t ncp = utf8_codepoint(chars[i + 1]); - if (ncp == 'o' || ncp == 0xF5) { - ipa += "\xC9\x90\xCC\x83w\xCC\x83"; // ɐ̃w̃ - i += 2; continue; - } - } - ipa += "\xC9\x90\xCC\x83"; // ɐ̃ - i++; continue; - } - if (cp == 0xF5 || cp == 0xD5) { - // Check for õe - if 
(!next_l.empty() && (next_l == "e" || next_l == "\xC3\xA9")) { - ipa += "o\xCC\x83j\xCC\x83"; // õj̃ - i += 2; continue; - } - ipa += "o\xCC\x83"; // õ - i++; continue; - } - - // --- Accented vowels --- - if (cp == 0xE1 || cp == 0xC1) { ipa += "a"; i++; continue; } // á - if (cp == 0xE2 || cp == 0xC2) { ipa += "a"; i++; continue; } // â - if (cp == 0xE9 || cp == 0xC9) { ipa += "\xC9\x9B"; i++; continue; } // é → ɛ (open) - if (cp == 0xEA || cp == 0xCA) { ipa += "e"; i++; continue; } // ê - if (cp == 0xED || cp == 0xCD) { ipa += "i"; i++; continue; } // í - if (cp == 0xF3 || cp == 0xD3) { ipa += "\xC9\x94"; i++; continue; } // ó → ɔ (open) - if (cp == 0xF4 || cp == 0xD4) { ipa += "o"; i++; continue; } // ô - if (cp == 0xFA || cp == 0xDA) { ipa += "u"; i++; continue; } // ú - - // ç → s - if (cp == 0xE7 || cp == 0xC7) { ipa += "s"; i++; continue; } - - // Context-dependent - if (cl == "c") { - if (!next_l.empty() && (next_l == "e" || next_l == "i")) { - ipa += "s"; - } else { - ipa += "k"; - } - i++; continue; - } - if (cl == "g") { - if (!next_l.empty() && (next_l == "e" || next_l == "i")) { - ipa += "\xCA\x92"; // ʒ - } else { - ipa += "g"; - } - i++; continue; - } - if (cl == "r") { - // Initial r or rr = ʁ, intervocalic = ɾ - if (i == 0 || (i > 0 && !is_latin_vowel(chars[i - 1][0]))) { - ipa += "\xCA\x81"; // ʁ - } else { - ipa += "\xC9\xBE"; // ɾ - } - i++; continue; - } - if (cl == "s") { - // Intervocalic s = z - if (i > 0 && i + 1 < chars.size() - && is_latin_vowel(chars[i - 1][0]) && is_latin_vowel(chars[i + 1][0])) { - ipa += "z"; - } else { - ipa += "s"; - } - i++; continue; - } - - if (cl == "j") { ipa += "\xCA\x92"; i++; continue; } // ʒ - if (cl == "x") { ipa += "\xCA\x83"; i++; continue; } // ʃ (most common) - if (cl == "h") { i++; continue; } // silent - - // Passthrough - if (cp == ' ') { ipa += " "; i++; continue; } - if (cp >= 'a' && cp <= 'z') { ipa += static_cast(cp); i++; continue; } - if (cp >= 'A' && cp <= 'Z') { ipa += static_cast(cp + 32); 
i++; continue; } - - // Punctuation - if (cp == ',' || cp == '.' || cp == '!' || cp == '?' || cp == ';' - || cp == ':' || cp == '-') { - ipa += static_cast(cp); - i++; continue; - } - - i++; // skip unknown - } - - return kokoro_postprocess(ipa); -} - -// =========================================================================== -// ITALIAN -// =========================================================================== - -std::string multilingual::italian_g2p(const std::string& text) { - auto chars = utf8_split(text); - std::string ipa; - - for (size_t i = 0; i < chars.size(); ) { - std::string c = chars[i]; - uint32_t cp = utf8_codepoint(c); - - std::string cl; - if (cp >= 'A' && cp <= 'Z') { - cl = std::string(1, static_cast(cp + 32)); - } else { - cl = c; - } - - // Lookahead - std::string next_l, next2_l; - if (i + 1 < chars.size()) { - uint32_t ncp = utf8_codepoint(chars[i + 1]); - next_l = (ncp >= 'A' && ncp <= 'Z') - ? std::string(1, static_cast(ncp + 32)) : chars[i + 1]; - } - if (i + 2 < chars.size()) { - uint32_t ncp = utf8_codepoint(chars[i + 2]); - next2_l = (ncp >= 'A' && ncp <= 'Z') - ? 
std::string(1, static_cast(ncp + 32)) : chars[i + 2]; - } - - // --- Trigraphs --- - if (!next_l.empty() && !next2_l.empty()) { - std::string tri = cl + next_l + next2_l; - // sci before e/i = ʃ + vowel - if (tri == "sce" || tri == "sci") { - ipa += "\xCA\x83"; // ʃ - i += 2; continue; // consume sc, leave vowel - } - // gli before vowel = ʎ + vowel - if (cl == "g" && next_l == "l" && next2_l == "i") { - // Check if followed by a vowel - if (i + 3 < chars.size()) { - uint32_t nncp = utf8_codepoint(chars[i + 3]); - if (nncp == 'a' || nncp == 'e' || nncp == 'i' || nncp == 'o' || nncp == 'u') { - ipa += "\xCA\x8E"; // ʎ - i += 3; continue; // consume gli - } - } - // gli at end or before consonant = ʎi - ipa += "\xCA\x8Ei"; // ʎi - i += 3; continue; - } - // ghi/ghe = g + vowel (hard g before e/i) - if (cl == "g" && next_l == "h") { - if (next2_l == "e" || next2_l == "i") { - ipa += "g"; - i += 2; continue; // consume gh, leave vowel - } - } - // chi/che = k + vowel (hard c before e/i) - if (cl == "c" && next_l == "h") { - if (next2_l == "e" || next2_l == "i") { - ipa += "k"; - i += 2; continue; - } - } - } - - // --- Digraphs --- - if (!next_l.empty()) { - std::string di = cl + next_l; - if (di == "gn") { ipa += "\xC9\xB2"; i += 2; continue; } // ɲ - if (di == "sc") { - // sc before e/i = ʃ (already handled in trigraphs above for explicit vowel) - if (!next2_l.empty() && (next2_l == "e" || next2_l == "i")) { - ipa += "\xCA\x83"; // ʃ - i += 2; continue; - } - ipa += "sk"; i += 2; continue; - } - if (di == "qu") { ipa += "kw"; i += 2; continue; } - if (di == "ss") { ipa += "s"; i += 2; continue; } - if (di == "zz") { ipa += "ts"; i += 2; continue; } - if (di == "cc") { - if (!next2_l.empty() && (next2_l == "e" || next2_l == "i")) { - ipa += "t\xCA\x83"; // tʃ - i += 2; continue; - } - ipa += "kk"; i += 2; continue; - } - if (di == "gg") { - if (!next2_l.empty() && (next2_l == "e" || next2_l == "i")) { - ipa += "d\xCA\x92"; // dʒ - i += 2; continue; - } - ipa += "gg"; 
i += 2; continue; - } - if (di == "gl") { - // gl before i = ʎ (covered in trigraphs) - // gl otherwise = gl - ipa += "gl"; i += 2; continue; - } - } - - // --- Context-dependent consonants --- - if (cl == "c") { - if (!next_l.empty() && (next_l == "e" || next_l == "i")) { - ipa += "t\xCA\x83"; // tʃ - } else { - ipa += "k"; - } - i++; continue; - } - if (cl == "g") { - if (!next_l.empty() && (next_l == "e" || next_l == "i")) { - ipa += "d\xCA\x92"; // dʒ - } else { - ipa += "g"; - } - i++; continue; - } - if (cl == "z") { - // Default: ts (can be dz in some words — would need dictionary) - ipa += "ts"; - i++; continue; - } - if (cl == "s") { - // Intervocalic s = z - if (i > 0 && i + 1 < chars.size() - && is_latin_vowel(chars[i - 1][0]) && is_latin_vowel(chars[i + 1][0])) { - ipa += "z"; - } else { - ipa += "s"; - } - i++; continue; - } - - // Accented vowels - if (cp == 0xE0 || cp == 0xC0) { ipa += "a"; i++; continue; } // à - if (cp == 0xE1 || cp == 0xC1) { ipa += "a"; i++; continue; } // á - if (cp == 0xE8 || cp == 0xC8) { ipa += "\xC9\x9B"; i++; continue; } // è → ɛ - if (cp == 0xE9 || cp == 0xC9) { ipa += "e"; i++; continue; } // é - if (cp == 0xEC || cp == 0xCC) { ipa += "i"; i++; continue; } // ì - if (cp == 0xED || cp == 0xCD) { ipa += "i"; i++; continue; } // í - if (cp == 0xF2 || cp == 0xD2) { ipa += "\xC9\x94"; i++; continue; } // ò → ɔ - if (cp == 0xF3 || cp == 0xD3) { ipa += "o"; i++; continue; } // ó - if (cp == 0xF9 || cp == 0xD9) { ipa += "u"; i++; continue; } // ù - if (cp == 0xFA || cp == 0xDA) { ipa += "u"; i++; continue; } // ú - - if (cl == "h") { i++; continue; } // silent - if (cl == "j") { ipa += "j"; i++; continue; } - - // Passthrough - if (cp == ' ') { ipa += " "; i++; continue; } - if (cp >= 'a' && cp <= 'z') { ipa += static_cast(cp); i++; continue; } - if (cp >= 'A' && cp <= 'Z') { ipa += static_cast(cp + 32); i++; continue; } - - // Punctuation - if (cp == ',' || cp == '.' || cp == '!' || cp == '?' 
|| cp == ';' - || cp == ':' || cp == '-') { - ipa += static_cast(cp); - i++; continue; - } - - i++; // skip unknown - } - - return kokoro_postprocess(ipa); -} - -// =========================================================================== -// JAPANESE -// =========================================================================== - -// Katakana and Hiragana → IPA tables. -// Built as static maps initialized on first use. - -struct KanaEntry { - const char* kana; - const char* ipa; -}; - -static const std::unordered_map& get_kana_map() { - static const std::unordered_map map = []() { - std::unordered_map m; - - // --- Katakana digraphs (must be checked before singles) --- - // Stored with their UTF-8 sequences. - - // キャ行 - m["\xe3\x82\xad\xe3\x83\xa3"] = "kja"; // キャ - m["\xe3\x82\xad\xe3\x83\xa5"] = "kju"; // キュ - m["\xe3\x82\xad\xe3\x83\xa7"] = "kjo"; // キョ - - // シャ行 - m["\xe3\x82\xb7\xe3\x83\xa3"] = "\xca\x83" "a"; // シャ = ʃa - m["\xe3\x82\xb7\xe3\x83\xa5"] = "\xca\x83" "u"; // シュ = ʃu - m["\xe3\x82\xb7\xe3\x83\xa7"] = "\xca\x83" "o"; // ショ = ʃo - - // チャ行 - m["\xe3\x83\x81\xe3\x83\xa3"] = "t\xca\x83" "a"; // チャ = tʃa - m["\xe3\x83\x81\xe3\x83\xa5"] = "t\xca\x83" "u"; // チュ = tʃu - m["\xe3\x83\x81\xe3\x83\xa7"] = "t\xca\x83" "o"; // チョ = tʃo - - // ニャ行 - m["\xe3\x83\x8b\xe3\x83\xa3"] = "\xc9\xb2" "a"; // ニャ = ɲa - m["\xe3\x83\x8b\xe3\x83\xa5"] = "\xc9\xb2" "u"; // ニュ = ɲu - m["\xe3\x83\x8b\xe3\x83\xa7"] = "\xc9\xb2" "o"; // ニョ = ɲo - - // ヒャ行 - m["\xe3\x83\x92\xe3\x83\xa3"] = "\xc3\xa7" "a"; // ヒャ = ça - m["\xe3\x83\x92\xe3\x83\xa5"] = "\xc3\xa7" "u"; // ヒュ = çu - m["\xe3\x83\x92\xe3\x83\xa7"] = "\xc3\xa7" "o"; // ヒョ = ço - - // ミャ行 - m["\xe3\x83\x9f\xe3\x83\xa3"] = "mja"; // ミャ - m["\xe3\x83\x9f\xe3\x83\xa5"] = "mju"; // ミュ - m["\xe3\x83\x9f\xe3\x83\xa7"] = "mjo"; // ミョ - - // リャ行 - m["\xe3\x83\xaa\xe3\x83\xa3"] = "\xc9\xbe" "ja"; // リャ = ɾja - m["\xe3\x83\xaa\xe3\x83\xa5"] = "\xc9\xbe" "ju"; // リュ = ɾju - m["\xe3\x83\xaa\xe3\x83\xa7"] = "\xc9\xbe" "jo"; // 
リョ = ɾjo - - // ギャ行 - m["\xe3\x82\xae\xe3\x83\xa3"] = "gja"; // ギャ - m["\xe3\x82\xae\xe3\x83\xa5"] = "gju"; // ギュ - m["\xe3\x82\xae\xe3\x83\xa7"] = "gjo"; // ギョ - - // ジャ行 - m["\xe3\x82\xb8\xe3\x83\xa3"] = "d\xca\x92" "a"; // ジャ = dʒa - m["\xe3\x82\xb8\xe3\x83\xa5"] = "d\xca\x92" "u"; // ジュ = dʒu - m["\xe3\x82\xb8\xe3\x83\xa7"] = "d\xca\x92" "o"; // ジョ = dʒo - - // ビャ行 - m["\xe3\x83\x93\xe3\x83\xa3"] = "bja"; // ビャ - m["\xe3\x83\x93\xe3\x83\xa5"] = "bju"; // ビュ - m["\xe3\x83\x93\xe3\x83\xa7"] = "bjo"; // ビョ - - // ピャ行 - m["\xe3\x83\x94\xe3\x83\xa3"] = "pja"; // ピャ - m["\xe3\x83\x94\xe3\x83\xa5"] = "pju"; // ピュ - m["\xe3\x83\x94\xe3\x83\xa7"] = "pjo"; // ピョ - - // --- Katakana singles --- - // ア行 - m["\xe3\x82\xa2"] = "a"; // ア - m["\xe3\x82\xa4"] = "i"; // イ - m["\xe3\x82\xa6"] = "\xc9\xb0"; // ウ = ɰ (unrounded) - m["\xe3\x82\xa8"] = "e"; // エ - m["\xe3\x82\xaa"] = "o"; // オ - - // カ行 - m["\xe3\x82\xab"] = "ka"; // カ - m["\xe3\x82\xad"] = "ki"; // キ - m["\xe3\x82\xaf"] = "k\xc9\xb0"; // ク = kɰ - m["\xe3\x82\xb1"] = "ke"; // ケ - m["\xe3\x82\xb3"] = "ko"; // コ - - // サ行 - m["\xe3\x82\xb5"] = "sa"; // サ - m["\xe3\x82\xb7"] = "\xca\x83i"; // シ = ʃi - m["\xe3\x82\xb9"] = "s\xc9\xb0"; // ス = sɰ - m["\xe3\x82\xbb"] = "se"; // セ - m["\xe3\x82\xbd"] = "so"; // ソ - - // タ行 - m["\xe3\x82\xbf"] = "ta"; // タ - m["\xe3\x83\x81"] = "t\xca\x83i"; // チ = tʃi - m["\xe3\x83\x84"] = "ts\xc9\xb0"; // ツ = tsɰ - m["\xe3\x83\x86"] = "te"; // テ - m["\xe3\x83\x88"] = "to"; // ト - - // ナ行 - m["\xe3\x83\x8a"] = "na"; // ナ - m["\xe3\x83\x8b"] = "\xc9\xb2i"; // ニ = ɲi - m["\xe3\x83\x8c"] = "n\xc9\xb0"; // ヌ = nɰ - m["\xe3\x83\x8d"] = "ne"; // ネ - m["\xe3\x83\x8e"] = "no"; // ノ - - // ハ行 - m["\xe3\x83\x8f"] = "ha"; // ハ - m["\xe3\x83\x92"] = "\xc3\xa7i"; // ヒ = çi - m["\xe3\x83\x95"] = "\xc9\xb8\xc9\xb0"; // フ = ɸɰ - m["\xe3\x83\x98"] = "he"; // ヘ - m["\xe3\x83\x9b"] = "ho"; // ホ - - // マ行 - m["\xe3\x83\x9e"] = "ma"; // マ - m["\xe3\x83\x9f"] = "mi"; // ミ - m["\xe3\x83\xa0"] = "m\xc9\xb0"; // ム = 
mɰ - m["\xe3\x83\xa1"] = "me"; // メ - m["\xe3\x83\xa2"] = "mo"; // モ - - // ヤ行 - m["\xe3\x83\xa4"] = "ja"; // ヤ - m["\xe3\x83\xa6"] = "j\xc9\xb0"; // ユ = jɰ - m["\xe3\x83\xa8"] = "jo"; // ヨ - - // ラ行 - m["\xe3\x83\xa9"] = "\xc9\xbe" "a"; // ラ = ɾa - m["\xe3\x83\xaa"] = "\xc9\xbe" "i"; // リ = ɾi - m["\xe3\x83\xab"] = "\xc9\xbe\xc9\xb0"; // ル = ɾɰ - m["\xe3\x83\xac"] = "\xc9\xbe" "e"; // レ = ɾe - m["\xe3\x83\xad"] = "\xc9\xbe" "o"; // ロ = ɾo - - // ワ行 - m["\xe3\x83\xaf"] = "wa"; // ワ - m["\xe3\x83\xb2"] = "o"; // ヲ - m["\xe3\x83\xb3"] = "\xc9\xb4"; // ン = ɴ - - // 濁音 (voiced) — ガ行 - m["\xe3\x82\xac"] = "ga"; // ガ - m["\xe3\x82\xae"] = "gi"; // ギ - m["\xe3\x82\xb0"] = "g\xc9\xb0"; // グ = gɰ - m["\xe3\x82\xb2"] = "ge"; // ゲ - m["\xe3\x82\xb4"] = "go"; // ゴ - - // ザ行 - m["\xe3\x82\xb6"] = "za"; // ザ - m["\xe3\x82\xb8"] = "d\xca\x92i"; // ジ = dʒi - m["\xe3\x82\xba"] = "z\xc9\xb0"; // ズ = zɰ - m["\xe3\x82\xbc"] = "ze"; // ゼ - m["\xe3\x82\xbe"] = "zo"; // ゾ - - // ダ行 - m["\xe3\x83\x80"] = "da"; // ダ - m["\xe3\x83\x82"] = "d\xca\x92i"; // ヂ = dʒi - m["\xe3\x83\x85"] = "z\xc9\xb0"; // ヅ = zɰ - m["\xe3\x83\x87"] = "de"; // デ - m["\xe3\x83\x89"] = "do"; // ド - - // バ行 - m["\xe3\x83\x90"] = "ba"; // バ - m["\xe3\x83\x93"] = "bi"; // ビ - m["\xe3\x83\x96"] = "b\xc9\xb0"; // ブ = bɰ - m["\xe3\x83\x99"] = "be"; // ベ - m["\xe3\x83\x9c"] = "bo"; // ボ - - // パ行 - m["\xe3\x83\x91"] = "pa"; // パ - m["\xe3\x83\x94"] = "pi"; // ピ - m["\xe3\x83\x97"] = "p\xc9\xb0"; // プ = pɰ - m["\xe3\x83\x9a"] = "pe"; // ペ - m["\xe3\x83\x9d"] = "po"; // ポ - - // Special - m["\xe3\x83\x83"] = "\xca\x94"; // ッ (small tsu) = ʔ (glottal stop) - m["\xe3\x83\xbc"] = "\xcb\x90"; // ー (long vowel) = ː - - // --- Hiragana (offset katakana by 0x60) --- - // We add the same entries for hiragana. 
- // Hiragana range: U+3041-U+3093 - // Katakana range: U+30A1-U+30F3 - // Offset: Hiragana = Katakana - 0x60 - - // Hiragana vowels - m["\xe3\x81\x82"] = "a"; // あ - m["\xe3\x81\x84"] = "i"; // い - m["\xe3\x81\x86"] = "\xc9\xb0"; // う = ɰ - m["\xe3\x81\x88"] = "e"; // え - m["\xe3\x81\x8a"] = "o"; // お - - // か行 - m["\xe3\x81\x8b"] = "ka"; // か - m["\xe3\x81\x8d"] = "ki"; // き - m["\xe3\x81\x8f"] = "k\xc9\xb0"; // く - m["\xe3\x81\x91"] = "ke"; // け - m["\xe3\x81\x93"] = "ko"; // こ - - // さ行 - m["\xe3\x81\x95"] = "sa"; // さ - m["\xe3\x81\x97"] = "\xca\x83i"; // し = ʃi - m["\xe3\x81\x99"] = "s\xc9\xb0"; // す - m["\xe3\x81\x9b"] = "se"; // せ - m["\xe3\x81\x9d"] = "so"; // そ - - // た行 - m["\xe3\x81\x9f"] = "ta"; // た - m["\xe3\x81\xa1"] = "t\xca\x83i"; // ち = tʃi - m["\xe3\x81\xa4"] = "ts\xc9\xb0"; // つ - m["\xe3\x81\xa6"] = "te"; // て - m["\xe3\x81\xa8"] = "to"; // と - - // な行 - m["\xe3\x81\xaa"] = "na"; // な - m["\xe3\x81\xab"] = "\xc9\xb2i"; // に = ɲi - m["\xe3\x81\xac"] = "n\xc9\xb0"; // ぬ - m["\xe3\x81\xad"] = "ne"; // ね - m["\xe3\x81\xae"] = "no"; // の - - // は行 - m["\xe3\x81\xaf"] = "ha"; // は - m["\xe3\x81\xb2"] = "\xc3\xa7i"; // ひ = çi - m["\xe3\x81\xb5"] = "\xc9\xb8\xc9\xb0"; // ふ = ɸɰ - m["\xe3\x81\xb8"] = "he"; // へ - m["\xe3\x81\xbb"] = "ho"; // ほ - - // ま行 - m["\xe3\x81\xbe"] = "ma"; // ま - m["\xe3\x81\xbf"] = "mi"; // み - m["\xe3\x82\x80"] = "m\xc9\xb0"; // む - m["\xe3\x82\x81"] = "me"; // め - m["\xe3\x82\x82"] = "mo"; // も - - // や行 - m["\xe3\x82\x84"] = "ja"; // や - m["\xe3\x82\x86"] = "j\xc9\xb0"; // ゆ - m["\xe3\x82\x88"] = "jo"; // よ - - // ら行 - m["\xe3\x82\x89"] = "\xc9\xbe" "a"; // ら = ɾa - m["\xe3\x82\x8a"] = "\xc9\xbe" "i"; // り = ɾi - m["\xe3\x82\x8b"] = "\xc9\xbe\xc9\xb0"; // る = ɾɰ - m["\xe3\x82\x8c"] = "\xc9\xbe" "e"; // れ = ɾe - m["\xe3\x82\x8d"] = "\xc9\xbe" "o"; // ろ = ɾo - - // わ行 - m["\xe3\x82\x8f"] = "wa"; // わ - m["\xe3\x82\x92"] = "o"; // を - m["\xe3\x82\x93"] = "\xc9\xb4"; // ん = ɴ - - // 濁音 — が行 - m["\xe3\x81\x8c"] = "ga"; // が - 
m["\xe3\x81\x8e"] = "gi"; // ぎ - m["\xe3\x81\x90"] = "g\xc9\xb0"; // ぐ - m["\xe3\x81\x92"] = "ge"; // げ - m["\xe3\x81\x94"] = "go"; // ご - - // ざ行 - m["\xe3\x81\x96"] = "za"; // ざ - m["\xe3\x81\x98"] = "d\xca\x92i"; // じ = dʒi - m["\xe3\x81\x9a"] = "z\xc9\xb0"; // ず - m["\xe3\x81\x9c"] = "ze"; // ぜ - m["\xe3\x81\x9e"] = "zo"; // ぞ - - // だ行 - m["\xe3\x81\xa0"] = "da"; // だ - m["\xe3\x81\xa2"] = "d\xca\x92i"; // ぢ = dʒi - m["\xe3\x81\xa5"] = "z\xc9\xb0"; // づ - m["\xe3\x81\xa7"] = "de"; // で - m["\xe3\x81\xa9"] = "do"; // ど - - // ば行 - m["\xe3\x81\xb0"] = "ba"; // ば - m["\xe3\x81\xb3"] = "bi"; // び - m["\xe3\x81\xb6"] = "b\xc9\xb0"; // ぶ - m["\xe3\x81\xb9"] = "be"; // べ - m["\xe3\x81\xbc"] = "bo"; // ぼ - - // ぱ行 - m["\xe3\x81\xb1"] = "pa"; // ぱ - m["\xe3\x81\xb4"] = "pi"; // ぴ - m["\xe3\x81\xb7"] = "p\xc9\xb0"; // ぷ - m["\xe3\x81\xba"] = "pe"; // ぺ - m["\xe3\x81\xbd"] = "po"; // ぽ - - // Special hiragana - m["\xe3\x81\xa3"] = "\xca\x94"; // っ (small tsu) = ʔ - - // Hiragana digraphs (きゃ etc.) 
- m["\xe3\x81\x8d\xe3\x82\x83"] = "kja"; // きゃ - m["\xe3\x81\x8d\xe3\x82\x85"] = "kju"; // きゅ - m["\xe3\x81\x8d\xe3\x82\x87"] = "kjo"; // きょ - - m["\xe3\x81\x97\xe3\x82\x83"] = "\xca\x83" "a"; // しゃ = ʃa - m["\xe3\x81\x97\xe3\x82\x85"] = "\xca\x83" "u"; // しゅ = ʃu - m["\xe3\x81\x97\xe3\x82\x87"] = "\xca\x83" "o"; // しょ = ʃo - - m["\xe3\x81\xa1\xe3\x82\x83"] = "t\xca\x83" "a"; // ちゃ = tʃa - m["\xe3\x81\xa1\xe3\x82\x85"] = "t\xca\x83" "u"; // ちゅ = tʃu - m["\xe3\x81\xa1\xe3\x82\x87"] = "t\xca\x83" "o"; // ちょ = tʃo - - m["\xe3\x81\xab\xe3\x82\x83"] = "\xc9\xb2" "a"; // にゃ = ɲa - m["\xe3\x81\xab\xe3\x82\x85"] = "\xc9\xb2" "u"; // にゅ = ɲu - m["\xe3\x81\xab\xe3\x82\x87"] = "\xc9\xb2" "o"; // にょ = ɲo - - m["\xe3\x81\xb2\xe3\x82\x83"] = "\xc3\xa7" "a"; // ひゃ = ça - m["\xe3\x81\xb2\xe3\x82\x85"] = "\xc3\xa7" "u"; // ひゅ = çu - m["\xe3\x81\xb2\xe3\x82\x87"] = "\xc3\xa7" "o"; // ひょ = ço - - m["\xe3\x81\xbf\xe3\x82\x83"] = "mja"; // みゃ - m["\xe3\x81\xbf\xe3\x82\x85"] = "mju"; // みゅ - m["\xe3\x81\xbf\xe3\x82\x87"] = "mjo"; // みょ - - m["\xe3\x82\x8a\xe3\x82\x83"] = "\xc9\xbe" "ja"; // りゃ = ɾja - m["\xe3\x82\x8a\xe3\x82\x85"] = "\xc9\xbe" "ju"; // りゅ = ɾju - m["\xe3\x82\x8a\xe3\x82\x87"] = "\xc9\xbe" "jo"; // りょ = ɾjo - - m["\xe3\x81\x8e\xe3\x82\x83"] = "gja"; // ぎゃ - m["\xe3\x81\x8e\xe3\x82\x85"] = "gju"; // ぎゅ - m["\xe3\x81\x8e\xe3\x82\x87"] = "gjo"; // ぎょ - - m["\xe3\x81\x98\xe3\x82\x83"] = "d\xca\x92" "a"; // じゃ = dʒa - m["\xe3\x81\x98\xe3\x82\x85"] = "d\xca\x92" "u"; // じゅ = dʒu - m["\xe3\x81\x98\xe3\x82\x87"] = "d\xca\x92" "o"; // じょ = dʒo - - m["\xe3\x81\xb3\xe3\x82\x83"] = "bja"; // びゃ - m["\xe3\x81\xb3\xe3\x82\x85"] = "bju"; // びゅ - m["\xe3\x81\xb3\xe3\x82\x87"] = "bjo"; // びょ - - m["\xe3\x81\xb4\xe3\x82\x83"] = "pja"; // ぴゃ - m["\xe3\x81\xb4\xe3\x82\x85"] = "pju"; // ぴゅ - m["\xe3\x81\xb4\xe3\x82\x87"] = "pjo"; // ぴょ - - return m; - }(); - return map; -} - -std::string multilingual::japanese_g2p(const std::string& text) { - const auto& kana_map = get_kana_map(); - auto 
chars = utf8_split(text); - std::string ipa; - - for (size_t i = 0; i < chars.size(); ) { - // Try digraph (two characters) first - if (i + 1 < chars.size()) { - std::string pair = chars[i] + chars[i + 1]; - auto it = kana_map.find(pair); - if (it != kana_map.end()) { - ipa += it->second; - i += 2; - continue; - } - } - - // Try single character - auto it = kana_map.find(chars[i]); - if (it != kana_map.end()) { - ipa += it->second; - i++; - continue; - } - - uint32_t cp = utf8_codepoint(chars[i]); - - // ASCII passthrough - if (cp == ' ') { ipa += " "; i++; continue; } - if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') - || (cp >= '0' && cp <= '9')) { - ipa += chars[i]; i++; continue; - } - // Punctuation - if (cp == ',' || cp == '.' || cp == '!' || cp == '?' - || cp == 0x3001 || cp == 0x3002) { // 、。 - ipa += ","; // normalize Japanese punctuation to comma pause - i++; continue; - } - - // CJK ideographs (kanji) — pass through as-is (requires JNI/dictionary for proper conversion) - if (cp >= 0x4E00 && cp <= 0x9FFF) { - // TODO: Kanji→reading conversion requires dictionary or JNI callback - ipa += chars[i]; - i++; continue; - } - - i++; // skip unknown - } - - return kokoro_postprocess(ipa); -} - -// =========================================================================== -// CHINESE (Pinyin → IPA) -// =========================================================================== - -// Pinyin syllable → IPA conversion. -// This handles pre-segmented pinyin input (space-separated syllables). -// For raw Chinese text, a pinyin segmenter is needed upstream (JNI or ICU). - -struct PinyinMapping { - const char* pinyin; - const char* ipa; -}; - -// Build the pinyin→IPA table on first use. 
-static const std::unordered_map& get_pinyin_finals_map() { - static const std::unordered_map map = { - // Complex finals first (longer match priority) - {"iang", "ja\xc5\x8b"}, // jaŋ - {"iong", "j\xca\x8a\xc5\x8b"}, // jʊŋ - {"uang", "wa\xc5\x8b"}, // waŋ - {"iao", "jaw"}, - {"ian", "j\xc9\x9bn"}, // jɛn - {"ang", "a\xc5\x8b"}, // aŋ - {"eng", "\xc9\x99\xc5\x8b"}, // əŋ - {"ing", "i\xc5\x8b"}, // iŋ - {"ong", "\xca\x8a\xc5\x8b"}, // ʊŋ - {"uai", "waj"}, - {"uan", "wan"}, - {"ai", "aj"}, - {"ei", "ej"}, - {"ao", "aw"}, - {"ou", "ow"}, - {"an", "an"}, - {"en", "\xc9\x99n"}, // ən - {"in", "in"}, - {"un", "\xc9\x99n"}, // ən (=uen simplified) - {"ia", "ja"}, - {"ie", "je"}, - {"uo", "wo"}, - {"ua", "wa"}, - {"ue", "we"}, // üe - {"ui", "wej"}, // =uei - {"iu", "jow"}, // =iou - {"er", "\xc9\x99\xc9\xbb"}, // əɻ - {"a", "a"}, - {"e", "\xc9\xa4"}, // ɤ - {"i", "i"}, - {"o", "wo"}, - {"u", "u"}, - }; - return map; -} - -static const std::unordered_map& get_pinyin_initials_map() { - static const std::unordered_map map = { - {"zh", "\xca\x88\xca\x82"}, // ʈʂ - {"ch", "\xca\x88\xca\x82\xca\xb0"}, // ʈʂʰ - {"sh", "\xca\x82"}, // ʂ - {"b", "p"}, - {"p", "p\xca\xb0"}, // pʰ - {"m", "m"}, - {"f", "f"}, - {"d", "t"}, - {"t", "t\xca\xb0"}, // tʰ - {"n", "n"}, - {"l", "l"}, - {"g", "k"}, - {"k", "k\xca\xb0"}, // kʰ - {"h", "x"}, - {"j", "t\xc9\x95"}, // tɕ - {"q", "t\xc9\x95\xca\xb0"}, // tɕʰ - {"x", "\xc9\x95"}, // ɕ - {"z", "ts"}, - {"c", "ts\xca\xb0"}, // tsʰ - {"s", "s"}, - {"r", "\xc9\xbb"}, // ɻ - {"y", "j"}, // glide - {"w", "w"}, // glide - }; - return map; -} - -/// Convert a single pinyin syllable (with optional tone number) to IPA. 
-static std::string pinyin_syllable_to_ipa(const std::string& syllable) { - if (syllable.empty()) return ""; - - std::string syl = to_lower_ascii(syllable); - - // Strip tone number (1-5) at end - if (!syl.empty() && syl.back() >= '1' && syl.back() <= '5') { - syl.pop_back(); - } - if (syl.empty()) return ""; - - // Handle ü (written as v or ü in some pinyin systems) - { - size_t pos = 0; - while ((pos = syl.find('v', pos)) != std::string::npos) { - syl.replace(pos, 1, "\xc3\xbc"); // ü - pos += 2; - } - } - - const auto& initials = get_pinyin_initials_map(); - const auto& finals = get_pinyin_finals_map(); - - std::string initial_ipa; - std::string remaining = syl; - - // Try two-char initial first, then one-char - if (syl.size() >= 2) { - auto it = initials.find(syl.substr(0, 2)); - if (it != initials.end()) { - initial_ipa = it->second; - remaining = syl.substr(2); - } - } - if (initial_ipa.empty() && syl.size() >= 1) { - auto it = initials.find(syl.substr(0, 1)); - if (it != initials.end()) { - initial_ipa = it->second; - remaining = syl.substr(1); - } - } - - // Special case: ü finals after j/q/x/y (written as u but pronounced y) - if (!remaining.empty() && remaining[0] == 'u') { - std::string init1 = syl.size() >= 1 ? syl.substr(0, 1) : ""; - if (init1 == "j" || init1 == "q" || init1 == "x" || init1 == "y") { - remaining = "v" + remaining.substr(1); // treat as ü - // Actually, for j/q/x, the u IS ü. Map to y sound. - // Keep as-is for finals matching, the final will handle it. 
- } - } - - // Match final - std::string final_ipa; - // Try longest match first - for (size_t len = std::min(remaining.size(), size_t(4)); len > 0; len--) { - auto it = finals.find(remaining.substr(0, len)); - if (it != finals.end()) { - final_ipa = it->second; - break; - } - } - - if (final_ipa.empty() && !remaining.empty()) { - // Fallback: just use the remaining as-is - final_ipa = remaining; - } - - return initial_ipa + final_ipa; -} - -std::string multilingual::chinese_g2p(const std::string& text) { - // Input is expected to be pinyin (space-separated syllables) or mixed text. - // CJK characters are passed through (would need pinyin conversion upstream). - auto chars = utf8_split(text); - std::string ipa; - std::string current_syllable; - - auto flush_syllable = [&]() { - if (!current_syllable.empty()) { - ipa += pinyin_syllable_to_ipa(current_syllable); - current_syllable.clear(); - } - }; - - for (size_t i = 0; i < chars.size(); i++) { - uint32_t cp = utf8_codepoint(chars[i]); - - // CJK ideographs — pass through (needs upstream pinyin conversion) - if (cp >= 0x4E00 && cp <= 0x9FFF) { - flush_syllable(); - // TODO: Character→pinyin conversion requires dictionary or JNI callback - ipa += chars[i]; - continue; - } - - // Space or punctuation = syllable boundary - if (cp == ' ' || cp == ',' || cp == '.' || cp == '!' || cp == '?' 
- || cp == ';' || cp == ':' || cp == '-') { - flush_syllable(); - if (cp == ' ') ipa += " "; - else ipa += static_cast(cp); - continue; - } - - // ASCII letters and digits = part of pinyin syllable - if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') - || (cp >= '0' && cp <= '9')) { - current_syllable += static_cast(cp); - continue; - } - - // ü (U+00FC) - if (cp == 0xFC) { - current_syllable += "v"; // internal representation - continue; - } - - // Skip unknown - flush_syllable(); - } - flush_syllable(); - - return kokoro_postprocess(ipa); -} - -// =========================================================================== -// HINDI (Devanagari → IPA) -// =========================================================================== - -// Devanagari consonants → IPA -static const std::unordered_map& get_devanagari_consonants() { - static const std::unordered_map map = { - // Velars - {0x0915, "k"}, // क - {0x0916, "k\xca\xb0"}, // ख = kʰ - {0x0917, "\xc9\xa1"}, // ग = ɡ - {0x0918, "\xc9\xa1\xca\xb1"}, // घ = ɡʱ - {0x0919, "\xc5\x8b"}, // ङ = ŋ - - // Palatals - {0x091A, "t\xca\x83"}, // च = tʃ - {0x091B, "t\xca\x83\xca\xb0"}, // छ = tʃʰ - {0x091C, "d\xca\x92"}, // ज = dʒ - {0x091D, "d\xca\x92\xca\xb1"}, // झ = dʒʱ - {0x091E, "\xc9\xb2"}, // ञ = ɲ - - // Retroflexes - {0x091F, "\xca\x88"}, // ट = ʈ - {0x0920, "\xca\x88\xca\xb0"}, // ठ = ʈʰ - {0x0921, "\xc9\x96"}, // ड = ɖ - {0x0922, "\xc9\x96\xca\xb1"}, // ढ = ɖʱ - {0x0923, "\xc9\xb3"}, // ण = ɳ - - // Dentals - {0x0924, "t\xcc\xaa"}, // त = t̪ - {0x0925, "t\xcc\xaa\xca\xb0"}, // थ = t̪ʰ - {0x0926, "d\xcc\xaa"}, // द = d̪ - {0x0927, "d\xcc\xaa\xca\xb1"}, // ध = d̪ʱ - {0x0928, "n"}, // न = n - - // Labials - {0x092A, "p"}, // प - {0x092B, "p\xca\xb0"}, // फ = pʰ - {0x092C, "b"}, // ब - {0x092D, "b\xca\xb1"}, // भ = bʱ - {0x092E, "m"}, // म - - // Semi-vowels / Approximants - {0x092F, "j"}, // य - {0x0930, "\xc9\xbe"}, // र = ɾ - {0x0932, "l"}, // ल - {0x0935, "\xca\x8b"}, // व = ʋ - - // Sibilants / Fricatives - 
{0x0936, "\xca\x83"}, // श = ʃ - {0x0937, "\xca\x82"}, // ष = ʂ - {0x0938, "s"}, // स - {0x0939, "\xc9\xa6"}, // ह = ɦ - - // Nukta variants - {0x0958, "k"}, // क़ → k (Urdu qaf) - {0x0959, "x"}, // ख़ → x - {0x095A, "\xc9\xa3"}, // ग़ → ɣ - {0x095B, "z"}, // ज़ → z - {0x095C, "\xc9\x96"}, // ड़ → ɖ (flap) - {0x095D, "\xc9\x96\xca\xb1"}, // ढ़ → ɖʱ - {0x095E, "f"}, // फ़ → f - }; - return map; -} - -// Devanagari independent vowels → IPA -static const std::unordered_map& get_devanagari_vowels() { - static const std::unordered_map map = { - {0x0905, "\xc9\x99"}, // अ = ə - {0x0906, "a\xcb\x90"}, // आ = aː - {0x0907, "\xc9\xaa"}, // इ = ɪ - {0x0908, "i\xcb\x90"}, // ई = iː - {0x0909, "\xca\x8a"}, // उ = ʊ - {0x090A, "u\xcb\x90"}, // ऊ = uː - {0x090B, "\xc9\xbe\xc9\xaa"}, // ऋ = ɾɪ - {0x090F, "e\xcb\x90"}, // ए = eː - {0x0910, "\xc9\x99j"}, // ऐ = əj (diphthong) - {0x0913, "o\xcb\x90"}, // ओ = oː - {0x0914, "\xc9\x99w"}, // औ = əw (diphthong) - }; - return map; -} - -// Devanagari vowel signs (matras) → IPA -static const std::unordered_map& get_devanagari_matras() { - static const std::unordered_map map = { - {0x093E, "a\xcb\x90"}, // ा = aː - {0x093F, "\xc9\xaa"}, // ि = ɪ - {0x0940, "i\xcb\x90"}, // ी = iː - {0x0941, "\xca\x8a"}, // ु = ʊ - {0x0942, "u\xcb\x90"}, // ू = uː - {0x0943, "\xc9\xbe\xc9\xaa"}, // ृ = ɾɪ - {0x0947, "e\xcb\x90"}, // े = eː - {0x0948, "\xc9\x99j"}, // ै = əj - {0x094B, "o\xcb\x90"}, // ो = oː - {0x094C, "\xc9\x99w"}, // ौ = əw - }; - return map; -} - -std::string multilingual::hindi_g2p(const std::string& text) { - const auto& consonants = get_devanagari_consonants(); - const auto& vowels = get_devanagari_vowels(); - const auto& matras = get_devanagari_matras(); - - auto chars = utf8_split(text); - std::string ipa; - bool prev_was_consonant = false; // track for inherent schwa - - for (size_t i = 0; i < chars.size(); i++) { - uint32_t cp = utf8_codepoint(chars[i]); - - // Virama (halant) — suppresses inherent vowel - if (cp == 0x094D) { - 
prev_was_consonant = false; // no schwa for previous consonant - continue; - } - - // Anusvara (nasalization) - if (cp == 0x0902) { - ipa += "\xc9\xb4"; // ɴ (generic nasal, assimilates in speech) - prev_was_consonant = false; - continue; - } - - // Visarga - if (cp == 0x0903) { - ipa += "\xc9\xa6"; // ɦ - prev_was_consonant = false; - continue; - } - - // Chandrabindu (nasalization of vowel) - if (cp == 0x0901) { - ipa += "\xcc\x83"; // combining tilde (nasalize previous vowel) - continue; - } - - // Nukta — modifies previous consonant. Skip (handled in nukta consonant entries). - if (cp == 0x093C) { - continue; - } - - // Check vowel signs (matras) first - auto matra_it = matras.find(cp); - if (matra_it != matras.end()) { - prev_was_consonant = false; - ipa += matra_it->second; - continue; - } - - // Independent vowels - auto vowel_it = vowels.find(cp); - if (vowel_it != vowels.end()) { - if (prev_was_consonant) { - // Previous consonant had no explicit vowel — add inherent schwa - ipa += "\xc9\x99"; // ə - } - prev_was_consonant = false; - ipa += vowel_it->second; - continue; - } - - // Consonants - auto cons_it = consonants.find(cp); - if (cons_it != consonants.end()) { - if (prev_was_consonant) { - // Previous consonant had no explicit vowel — add inherent schwa - ipa += "\xc9\x99"; // ə - } - ipa += cons_it->second; - prev_was_consonant = true; - continue; - } - - // Space - if (cp == ' ') { - if (prev_was_consonant) { - // Word-final consonant: add schwa for open syllables - // (Hindi schwa deletion is complex — we add it conservatively) - ipa += "\xc9\x99"; // ə - } - prev_was_consonant = false; - ipa += " "; - continue; - } - - // ASCII passthrough - if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') - || (cp >= '0' && cp <= '9')) { - if (prev_was_consonant) { - ipa += "\xc9\x99"; // ə - prev_was_consonant = false; - } - ipa += chars[i]; - continue; - } - - // Punctuation - if (cp == ',' || cp == '.' || cp == '!' || cp == '?' 
- || cp == ';' || cp == ':' || cp == '-' - || cp == 0x0964 || cp == 0x0965) { // Devanagari danda / double danda - if (prev_was_consonant) { - ipa += "\xc9\x99"; // ə - prev_was_consonant = false; - } - if (cp == 0x0964 || cp == 0x0965) { - ipa += "."; - } else { - ipa += static_cast(cp); - } - continue; - } - - // Devanagari digits (0x0966-0x096F) — pass through as Arabic numerals - if (cp >= 0x0966 && cp <= 0x096F) { - if (prev_was_consonant) { - ipa += "\xc9\x99"; - prev_was_consonant = false; - } - ipa += static_cast('0' + (cp - 0x0966)); - continue; - } - - // Skip unknown - if (prev_was_consonant) { - ipa += "\xc9\x99"; - prev_was_consonant = false; - } - } - - // Handle trailing consonant - if (prev_was_consonant) { - ipa += "\xc9\x99"; // ə - } - - return kokoro_postprocess(ipa); -} - -// =========================================================================== -// DICTIONARY-FIRST PHONEMIZERS -// =========================================================================== - -// Split text into words and punctuation tokens for dictionary lookup. -// Returns vector of strings: each is either a word (letters), whitespace, or punctuation. -static std::vector split_into_tokens(const std::string& text) { - std::vector tokens; - auto chars = utf8_split(text); - std::string current_word; - - auto flush_word = [&]() { - if (!current_word.empty()) { - tokens.push_back(current_word); - current_word.clear(); - } - }; - - for (size_t i = 0; i < chars.size(); i++) { - uint32_t cp = utf8_codepoint(chars[i]); - - // Whitespace - if (cp == ' ' || cp == '\t' || cp == '\n' || cp == '\r') { - flush_word(); - tokens.push_back(" "); - continue; - } - - // ASCII punctuation that should be passed through - if (cp == ',' || cp == '.' || cp == '!' || cp == '?' 
|| - cp == ';' || cp == ':' || cp == '-' || cp == '\'' || cp == '"') { - flush_word(); - tokens.push_back(chars[i]); - continue; - } - - // Inverted punctuation (Spanish) - if (cp == 0xBF || cp == 0xA1) { - flush_word(); - tokens.push_back(chars[i]); - continue; - } - - // Japanese/Chinese punctuation - if (cp == 0x3001 || cp == 0x3002 || cp == 0xFF0C || cp == 0xFF0E || - cp == 0xFF01 || cp == 0xFF1F) { - flush_word(); - tokens.push_back(","); // normalize CJK punctuation to comma - continue; - } - - // Devanagari danda - if (cp == 0x0964 || cp == 0x0965) { - flush_word(); - tokens.push_back("."); - continue; - } - - // Everything else is part of a word - current_word += chars[i]; - } - flush_word(); - return tokens; -} - -// Map punctuation token to IPA-compatible output. -static std::string punct_to_ipa(const std::string& tok) { - if (tok == "," || tok == "." || tok == "!" || tok == "?" || - tok == ";" || tok == ":" || tok == "-" || tok == "'") { - return tok; - } - return ""; -} - -// Lowercase a UTF-8 string (handles ASCII letters and common Latin accented chars). -static std::string utf8_to_lower(const std::string& s) { - auto chars = utf8_split(s); - std::string result; - for (auto& ch : chars) { - uint32_t cp = utf8_codepoint(ch); - if (cp >= 'A' && cp <= 'Z') { - result += static_cast(cp + 32); - } else if (cp >= 0xC0 && cp <= 0xD6) { - result += utf8_encode(cp + 32); - } else if (cp >= 0xD8 && cp <= 0xDE) { - result += utf8_encode(cp + 32); - } else { - result += ch; - } - } - return result; -} - -// Check if a token is whitespace. -static bool is_ws_token(const std::string& tok) { - for (char c : tok) { - if (c != ' ' && c != '\t' && c != '\n' && c != '\r') return false; - } - return !tok.empty(); -} - -// Check if a token is punctuation. -static bool is_punct_tok(const std::string& tok) { - if (tok.empty()) return false; - if (tok.size() == 1) { - char c = tok[0]; - return c == ',' || c == '.' || c == '!' || c == '?' 
|| - c == ';' || c == ':' || c == '-' || c == '\'' || c == '"'; - } - uint32_t cp = utf8_codepoint(tok); - return cp == 0xBF || cp == 0xA1; -} - -/// Generic dictionary-first phonemizer. -/// Splits text into words, looks up each in dict, falls back to g2p_fn. -static std::string dict_first_phonemize( - const std::string& text, - const std::unordered_map& dict, - std::string (*g2p_fn)(const std::string&)) -{ - auto tokens = split_into_tokens(text); - std::string result; - - for (auto& tok : tokens) { - if (is_ws_token(tok)) { - result += " "; - continue; - } - if (is_punct_tok(tok)) { - auto mapped = punct_to_ipa(tok); - if (!mapped.empty()) result += mapped; - continue; - } - - // Try dictionary lookup (lowercase) - auto lower = utf8_to_lower(tok); - auto it = dict.find(lower); - if (it != dict.end() && !it->second.empty()) { - result += kokoro_postprocess(it->second); - continue; - } - - // Fallback to rule-based G2P - result += g2p_fn(tok); - } - - return result; -} - -std::string multilingual::french_phonemize( - const std::string& text, - const std::unordered_map& dict) -{ - return dict_first_phonemize(text, dict, french_g2p); -} - -std::string multilingual::spanish_phonemize( - const std::string& text, - const std::unordered_map& dict) -{ - return dict_first_phonemize(text, dict, spanish_g2p); -} - -std::string multilingual::italian_phonemize( - const std::string& text, - const std::unordered_map& dict) -{ - return dict_first_phonemize(text, dict, italian_g2p); -} - -std::string multilingual::portuguese_phonemize( - const std::string& text, - const std::unordered_map& dict) -{ - return dict_first_phonemize(text, dict, portuguese_g2p); -} - -std::string multilingual::hindi_phonemize( - const std::string& text, - const std::unordered_map& dict) -{ - return dict_first_phonemize(text, dict, hindi_g2p); -} - -std::string multilingual::japanese_phonemize(const std::string& text) { - return japanese_g2p(text); -} - -std::string multilingual::chinese_phonemize(const 
std::string& text) { - return chinese_g2p(text); -} diff --git a/sdk/src/main/cpp/models/kokoro_multilingual.h b/sdk/src/main/cpp/models/kokoro_multilingual.h deleted file mode 100644 index 387cfb3..0000000 --- a/sdk/src/main/cpp/models/kokoro_multilingual.h +++ /dev/null @@ -1,51 +0,0 @@ -#pragma once - -#include -#include - -/// Non-English phonemizers for Kokoro TTS. -/// -/// Dictionary-first approach with rule-based G2P fallback: -/// 1. Split text into words (whitespace + punctuation boundaries) -/// 2. For each word: try dictionary lookup (lowercase), if found use it -/// 3. If not found: apply rule-based grapheme-to-phoneme conversion -/// 4. Pass punctuation tokens through -/// -/// Languages: -/// - French, Spanish, Portuguese, Italian, Hindi — dictionary + rule-based -/// - Japanese — Katakana/Hiragana tables, kanji passthrough -/// - Chinese — Pinyin->IPA conversion (requires pre-segmented pinyin input) -namespace multilingual { - -// --- Dictionary-first phonemizers (preferred entry points) --- - -std::string french_phonemize(const std::string& text, - const std::unordered_map& dict); - -std::string spanish_phonemize(const std::string& text, - const std::unordered_map& dict); - -std::string italian_phonemize(const std::string& text, - const std::unordered_map& dict); - -std::string portuguese_phonemize(const std::string& text, - const std::unordered_map& dict); - -std::string hindi_phonemize(const std::string& text, - const std::unordered_map& dict); - -std::string japanese_phonemize(const std::string& text); - -std::string chinese_phonemize(const std::string& text); - -// --- Rule-based G2P fallback (used when word not in dictionary) --- - -std::string french_g2p(const std::string& text); -std::string spanish_g2p(const std::string& text); -std::string portuguese_g2p(const std::string& text); -std::string italian_g2p(const std::string& text); -std::string japanese_g2p(const std::string& text); -std::string chinese_g2p(const std::string& text); 
-std::string hindi_g2p(const std::string& text); - -} // namespace multilingual diff --git a/sdk/src/main/cpp/models/kokoro_phonemizer.cpp b/sdk/src/main/cpp/models/kokoro_phonemizer.cpp deleted file mode 100644 index 05c3d03..0000000 --- a/sdk/src/main/cpp/models/kokoro_phonemizer.cpp +++ /dev/null @@ -1,456 +0,0 @@ -#include "kokoro_phonemizer.h" -#include "kokoro_multilingual.h" -#include -#include - -// --------------------------------------------------------------------------- -// UTF-8 helpers -// --------------------------------------------------------------------------- - -/// Iterate UTF-8 string one character (potentially multi-byte) at a time. -static std::vector utf8_chars(const std::string& s) { - std::vector chars; - size_t i = 0; - while (i < s.size()) { - size_t len = 1; - unsigned char c = static_cast(s[i]); - if ((c & 0xE0) == 0xC0) len = 2; - else if ((c & 0xF0) == 0xE0) len = 3; - else if ((c & 0xF8) == 0xF0) len = 4; - chars.push_back(s.substr(i, len)); - i += len; - } - return chars; -} - -static std::string to_lower(const std::string& s) { - std::string result = s; - std::transform(result.begin(), result.end(), result.begin(), - [](unsigned char c) { return std::tolower(c); }); - return result; -} - -static std::string capitalize(const std::string& s) { - if (s.empty()) return s; - std::string result = s; - result[0] = static_cast(std::toupper(static_cast(result[0]))); - return result; -} - -static bool is_punct(char c) { - return std::ispunct(static_cast(c)) != 0; -} - -static bool is_whitespace(const std::string& s) { - for (char c : s) if (!std::isspace(static_cast(c))) return false; - return !s.empty(); -} - -static bool is_all_punct(const std::string& s) { - for (char c : s) if (!is_punct(c)) return false; - return !s.empty(); -} - -static bool ends_with(const std::string& s, const std::string& suffix) { - if (suffix.size() > s.size()) return false; - return s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0; -} - -static 
void replace_all(std::string& s, const std::string& from, const std::string& to) { - size_t pos = 0; - while ((pos = s.find(from, pos)) != std::string::npos) { - s.replace(pos, from.size(), to); - pos += to.size(); - } -} - -// --------------------------------------------------------------------------- -// Loading -// --------------------------------------------------------------------------- - -bool KokoroPhonemizer::load_vocab(const std::string& path) { - auto text = json::read_file(path); - if (text.empty()) return false; - vocab_ = json::parse_vocab_index(text); - return !vocab_.empty(); -} - -bool KokoroPhonemizer::load_dictionaries(const std::string& dir) { - auto gold_text = json::read_file(dir + "/us_gold.json"); - if (!gold_text.empty()) { - gold_dict_ = json::parse_dictionary(gold_text); - grow_dictionary(gold_dict_); - } - - auto silver_text = json::read_file(dir + "/us_silver.json"); - if (!silver_text.empty()) { - silver_dict_ = json::parse_dictionary(silver_text); - grow_dictionary(silver_dict_); - } - - return !gold_dict_.empty() || !silver_dict_.empty(); -} - -bool KokoroPhonemizer::load_language_dict( - const std::string& lang, const std::string& path) -{ - auto text = json::read_file(path); - if (text.empty()) return false; - - // Language dicts are flat {"word": "phonemes"} format - auto dict = json::parse_flat_object(text); - if (dict.empty()) return false; - - lang_dicts_[lang] = std::move(dict); - return true; -} - -void KokoroPhonemizer::set_language(const std::string& lang) { - language_ = lang; -} - -void KokoroPhonemizer::grow_dictionary( - std::unordered_map& dict) -{ - std::unordered_map additions; - for (auto& [key, entry] : dict) { - auto lower = to_lower(key); - if (key == lower && !key.empty()) { - auto cap = capitalize(key); - if (dict.find(cap) == dict.end()) additions[cap] = entry; - } - if (!key.empty() && std::isupper(static_cast(key[0]))) { - if (dict.find(lower) == dict.end()) additions[lower] = entry; - } - } - for (auto& [k, 
v] : additions) dict[k] = std::move(v); -} - -// --------------------------------------------------------------------------- -// Tokenization -// --------------------------------------------------------------------------- - -std::vector KokoroPhonemizer::tokenize( - const std::string& text, int max_length) -{ - auto phonemes = text_to_phonemes(text); - std::vector ids = {BOS_ID}; - - // Tokenize IPA string character by character - // Spaces dropped (not in vocab) — matches iOS behavior - auto chars = utf8_chars(phonemes); - for (auto& ch : chars) { - auto it = vocab_.find(ch); - if (it != vocab_.end()) { - ids.push_back(it->second); - } - // Unknown chars (including spaces) silently dropped - } - - ids.push_back(EOS_ID); - - if (static_cast(ids.size()) > max_length) { - ids.resize(max_length - 1); - ids.push_back(EOS_ID); - } - - return ids; -} - -std::vector KokoroPhonemizer::pad( - const std::vector& ids, int length) -{ - if (static_cast(ids.size()) >= length) { - return std::vector(ids.begin(), ids.begin() + length); - } - auto result = ids; - result.resize(length, PAD_ID); - return result; -} - -// --------------------------------------------------------------------------- -// Text → Phonemes pipeline -// --------------------------------------------------------------------------- - -std::string KokoroPhonemizer::text_to_phonemes(const std::string& text) { - // Route non-English languages to multilingual phonemizers - if (language_ == "fr") { - auto it = lang_dicts_.find("fr"); - static const std::unordered_map empty; - return multilingual::french_phonemize(text, it != lang_dicts_.end() ? it->second : empty); - } - if (language_ == "es") { - auto it = lang_dicts_.find("es"); - static const std::unordered_map empty; - return multilingual::spanish_phonemize(text, it != lang_dicts_.end() ? 
it->second : empty); - } - if (language_ == "it") { - auto it = lang_dicts_.find("it"); - static const std::unordered_map empty; - return multilingual::italian_phonemize(text, it != lang_dicts_.end() ? it->second : empty); - } - if (language_ == "pt") { - auto it = lang_dicts_.find("pt"); - static const std::unordered_map empty; - return multilingual::portuguese_phonemize(text, it != lang_dicts_.end() ? it->second : empty); - } - if (language_ == "hi") { - auto it = lang_dicts_.find("hi"); - static const std::unordered_map empty; - return multilingual::hindi_phonemize(text, it != lang_dicts_.end() ? it->second : empty); - } - if (language_ == "ja") { - return multilingual::japanese_phonemize(text); - } - if (language_ == "zh") { - return multilingual::chinese_phonemize(text); - } - - // English (default) - auto normalized = normalize_text(text); - auto words = split_words(normalized); - - std::string result; - for (auto& word : words) { - if (is_whitespace(word)) { - result += " "; - continue; - } - if (is_all_punct(word)) { - auto mapped = punctuation_to_phoneme(word); - if (!mapped.empty()) result += mapped; - continue; - } - auto phonemes = resolve_word(word); - result += phonemes; - } - return result; -} - -// --------------------------------------------------------------------------- -// Text normalization -// --------------------------------------------------------------------------- - -std::string KokoroPhonemizer::normalize_text(const std::string& text) { - std::string result = text; - - struct Contraction { const char* from; const char* to; }; - static const Contraction contractions[] = { - {"can't", "can not"}, {"won't", "will not"}, {"don't", "do not"}, - {"doesn't", "does not"}, {"didn't", "did not"}, {"isn't", "is not"}, - {"aren't", "are not"}, {"wasn't", "was not"}, {"weren't", "were not"}, - {"couldn't", "could not"}, {"wouldn't", "would not"}, {"shouldn't", "should not"}, - {"haven't", "have not"}, {"hasn't", "has not"}, {"hadn't", "had not"}, - 
{"i'm", "i am"}, {"i've", "i have"}, {"i'll", "i will"}, {"i'd", "i would"}, - {"you're", "you are"}, {"you've", "you have"}, {"you'll", "you will"}, - {"he's", "he is"}, {"she's", "she is"}, {"it's", "it is"}, - {"we're", "we are"}, {"we've", "we have"}, {"we'll", "we will"}, - {"they're", "they are"}, {"they've", "they have"}, {"they'll", "they will"}, - {"that's", "that is"}, {"there's", "there is"}, {"let's", "let us"}, - }; - - auto lower = to_lower(result); - for (auto& c : contractions) { - if (lower.find(c.from) != std::string::npos) { - // Case-insensitive replace - std::string from_lower(c.from); - size_t pos = lower.find(from_lower); - while (pos != std::string::npos) { - result.replace(pos, from_lower.size(), c.to); - lower = to_lower(result); - pos = lower.find(from_lower, pos + std::string(c.to).size()); - } - } - } - - // Collapse multiple spaces - replace_all(result, " ", " "); - - // Trim - size_t start = result.find_first_not_of(" \t\n\r"); - size_t end = result.find_last_not_of(" \t\n\r"); - if (start == std::string::npos) return ""; - return result.substr(start, end - start + 1); -} - -std::vector KokoroPhonemizer::split_words(const std::string& text) { - std::vector words; - std::string current; - - for (char c : text) { - if (std::isspace(static_cast(c))) { - if (!current.empty()) { words.push_back(current); current.clear(); } - words.emplace_back(1, ' '); - } else if (is_punct(c)) { - if (!current.empty()) { words.push_back(current); current.clear(); } - words.emplace_back(1, c); - } else { - current += c; - } - } - if (!current.empty()) words.push_back(current); - return words; -} - -// --------------------------------------------------------------------------- -// Word resolution -// --------------------------------------------------------------------------- - -std::string KokoroPhonemizer::resolve_word(const std::string& word) { - auto lower = to_lower(word); - auto sp = special_case(lower); - if (!sp.empty()) return sp; - auto dict = 
lookup_dict(lower); - if (!dict.empty()) return dict; - auto stemmed = stem_and_lookup(lower); - if (!stemmed.empty()) return stemmed; - // Fallback: return word as-is (will be mostly dropped during tokenization) - return lower; -} - -std::string KokoroPhonemizer::lookup_dict(const std::string& word) { - auto it = gold_dict_.find(word); - if (it != gold_dict_.end()) return resolve_entry(it->second); - it = silver_dict_.find(word); - if (it != silver_dict_.end()) return resolve_entry(it->second); - return ""; -} - -std::string KokoroPhonemizer::resolve_entry(const json::DictEntry& entry) { - if (!entry.is_heteronym()) return entry.simple; - auto it = entry.pos_map.find("DEFAULT"); - if (it != entry.pos_map.end()) return it->second; - if (!entry.pos_map.empty()) return entry.pos_map.begin()->second; - return ""; -} - -std::string KokoroPhonemizer::special_case(const std::string& word) { - if (word == "the") return "\xC3\xB0\xC9\x99"; // ðə - if (word == "a") return "\xC9\x90"; // ɐ - if (word == "an") return "\xC9\x99n"; // ən - if (word == "to") return "t\xCA\x8A"; // tʊ - if (word == "of") return "\xCA\x8Cv"; // ʌv - if (word == "i") return "a\xC9\xAA"; // aɪ - return ""; -} - -std::string KokoroPhonemizer::punctuation_to_phoneme(const std::string& text) { - if (text == "," || text == "." || text == "!" || text == "?" 
|| - text == ";" || text == ":" || text == "-" || text == "'") { - return text; - } - return ""; -} - -// --------------------------------------------------------------------------- -// Suffix stemming -// --------------------------------------------------------------------------- - -std::string KokoroPhonemizer::stem_and_lookup(const std::string& word) { - auto r = stem_s(word); - if (!r.empty()) return r; - r = stem_ed(word); - if (!r.empty()) return r; - r = stem_ing(word); - if (!r.empty()) return r; - return ""; -} - -std::string KokoroPhonemizer::stem_s(const std::string& word) { - if (!ends_with(word, "s") || word.size() <= 2) return ""; - - if (ends_with(word, "ies")) { - auto stem = word.substr(0, word.size() - 3) + "y"; - auto phonemes = lookup_dict(stem); - if (!phonemes.empty()) return phonemes + "z"; - } - - if (ends_with(word, "es") && word.size() > 3) { - auto stem = word.substr(0, word.size() - 2); - auto phonemes = lookup_dict(stem); - if (!phonemes.empty()) { - if (!phonemes.empty()) { - char last = phonemes.back(); - // After sibilants: +ɪz - if (last == 's' || last == 'z') { - return phonemes + "\xC9\xAA" "z"; // ɪz - } - } - return phonemes + "z"; - } - } - - auto stem = word.substr(0, word.size() - 1); - auto phonemes = lookup_dict(stem); - if (!phonemes.empty()) { - // Voiceless consonants: +s, otherwise +z - char last = phonemes.back(); - if (last == 'p' || last == 't' || last == 'k' || last == 'f') { - return phonemes + "s"; - } - return phonemes + "z"; - } - return ""; -} - -std::string KokoroPhonemizer::stem_ed(const std::string& word) { - if (!ends_with(word, "ed") || word.size() <= 3) return ""; - - if (ends_with(word, "ied")) { - auto stem = word.substr(0, word.size() - 3) + "y"; - auto phonemes = lookup_dict(stem); - if (!phonemes.empty()) return phonemes + "d"; - } - - auto stem_base = word.substr(0, word.size() - 2); - if (stem_base.size() >= 2) { - char last = stem_base.back(); - char prev = stem_base[stem_base.size() - 2]; - if 
(last == prev) { - // Doubled consonant — try dedoubled stem - auto dedoubled = stem_base.substr(0, stem_base.size() - 1); - auto phonemes = lookup_dict(dedoubled); - if (!phonemes.empty()) return phonemes + ed_suffix(phonemes); - } - } - - auto phonemes = lookup_dict(stem_base); - if (!phonemes.empty()) return phonemes + ed_suffix(phonemes); - return ""; -} - -std::string KokoroPhonemizer::ed_suffix(const std::string& phonemes) { - if (phonemes.empty()) return "d"; - char last = phonemes.back(); - if (last == 't' || last == 'd') return "\xC9\xAA" "d"; // ɪd - if (last == 'p' || last == 'k' || last == 'f' || last == 's') { - return "t"; - } - return "d"; -} - -std::string KokoroPhonemizer::stem_ing(const std::string& word) { - if (!ends_with(word, "ing") || word.size() <= 4) return ""; - - auto stem = word.substr(0, word.size() - 3); - - if (stem.size() >= 2) { - char last = stem.back(); - char prev = stem[stem.size() - 2]; - if (last == prev) { - auto dedoubled = stem.substr(0, stem.size() - 1); - auto phonemes = lookup_dict(dedoubled); - if (!phonemes.empty()) return phonemes + "\xC9\xAA\xC5\x8B"; // ɪŋ - } - } - - auto phonemes = lookup_dict(stem); - if (!phonemes.empty()) return phonemes + "\xC9\xAA\xC5\x8B"; // ɪŋ - - // Try stem + "e" (e.g., "making" → "make") - auto stem_e = stem + "e"; - phonemes = lookup_dict(stem_e); - if (!phonemes.empty()) return phonemes + "\xC9\xAA\xC5\x8B"; // ɪŋ - - return ""; -} diff --git a/sdk/src/main/cpp/models/kokoro_phonemizer.h b/sdk/src/main/cpp/models/kokoro_phonemizer.h deleted file mode 100644 index 0f4db02..0000000 --- a/sdk/src/main/cpp/models/kokoro_phonemizer.h +++ /dev/null @@ -1,85 +0,0 @@ -#pragma once - -#include -#include -#include -#include "../util/json.h" - -/// GPL-free phonemizer for Kokoro TTS — ported from speech-swift. -/// -/// Three-tier approach for English (all Apache-2.0 / BSD compatible): -/// 1. Dictionary lookup — gold + silver IPA dictionaries from misaki -/// 2. 
Suffix stemming — strips -s/-ed/-ing, looks up stem, applies phonological rules -/// 3. BART G2P — encoder-decoder neural model for OOV words (optional ONNX) -/// -/// Non-English languages use dictionary-first with rule-based G2P fallback. -/// -/// No eSpeak-NG dependency. -class KokoroPhonemizer { -public: - // Kokoro's vocab uses '$' (token id 0) as the BOS / EOS / padding symbol — - // see vocab_index.json: '$' -> 0, ';' -> 1, ':' -> 2. Earlier code used 1 - // and 2, which prepended a literal semicolon and appended a colon to every - // utterance, throwing off the model's prosody and dropping the first word. - // Verified by round-tripping prompts through speech_synthesize + - // speech_transcribe: with the wrong wrap "Hello world" came back as - // "I wrote"; with id 0 it round-trips to "Hello world". - static constexpr int PAD_ID = 0; - static constexpr int BOS_ID = 0; - static constexpr int EOS_ID = 0; - - KokoroPhonemizer() = default; - - /// Load IPA symbol → token ID vocabulary from vocab_index.json. - bool load_vocab(const std::string& path); - - /// Load pronunciation dictionaries (us_gold.json, us_silver.json). - bool load_dictionaries(const std::string& dir); - - /// Load a language-specific pronunciation dictionary (dict_fr.json, etc.). - /// Returns true if the dictionary was loaded successfully. - bool load_language_dict(const std::string& lang, const std::string& path); - - /// Set the active language for phonemization. - /// Supported: "en" (default), "fr", "es", "it", "pt", "hi", "ja", "zh". - void set_language(const std::string& lang); - - /// Convert text → phoneme token IDs (with BOS/EOS, max 510). - std::vector tokenize(const std::string& text, int max_length = 510); - - /// Pad token IDs to fixed length. - std::vector pad(const std::vector& ids, int length); - - /// Convert text to IPA phoneme string. 
- std::string text_to_phonemes(const std::string& text); - -private: - - std::string normalize_text(const std::string& text); - std::vector split_words(const std::string& text); - std::string resolve_word(const std::string& word); - std::string lookup_dict(const std::string& word); - std::string special_case(const std::string& word); - std::string stem_and_lookup(const std::string& word); - std::string stem_s(const std::string& word); - std::string stem_ed(const std::string& word); - std::string stem_ing(const std::string& word); - std::string ed_suffix(const std::string& phonemes); - std::string punctuation_to_phoneme(const std::string& text); - - void grow_dictionary(std::unordered_map& dict); - std::string resolve_entry(const json::DictEntry& entry); - - // IPA symbol → token ID - std::unordered_map vocab_; - - // English pronunciation dictionaries - std::unordered_map gold_dict_; - std::unordered_map silver_dict_; - - // Active language (default: English) - std::string language_ = "en"; - - // Non-English pronunciation dictionaries keyed by language code - std::unordered_map> lang_dicts_; -}; diff --git a/sdk/src/main/cpp/models/kokoro_tts.cpp b/sdk/src/main/cpp/models/kokoro_tts.cpp deleted file mode 100644 index 63af1e5..0000000 --- a/sdk/src/main/cpp/models/kokoro_tts.cpp +++ /dev/null @@ -1,258 +0,0 @@ -#include "kokoro_tts.h" -#include "onnx_engine.h" -#include -#include -#include - -static constexpr int MAX_PHONEMES = 128; - -KokoroTts::KokoroTts( - const std::string& model_path, - const std::string& voices_dir, - const std::string& data_dir, - bool nnapi) - : voices_dir_(voices_dir) -{ - auto& engine = OnnxEngine::get(); - api_ = engine.api(); - session_ = engine.load(model_path, nnapi); - - // Load phonemizer vocabulary and dictionaries - phonemizer_.load_vocab(data_dir + "/vocab_index.json"); - phonemizer_.load_dictionaries(data_dir); - - // Load optional non-English pronunciation dictionaries - for (const char* lang : {"fr", "es", "it", "pt", "hi"}) { 
- phonemizer_.load_language_dict(lang, - data_dir + "/dict_" + lang + ".json"); - } - - // Load default voice - set_voice("af_heart"); -} - -KokoroTts::~KokoroTts() { - if (session_) api_->ReleaseSession(session_); -} - -void KokoroTts::set_voice(const std::string& name) { - voice_embedding_ = load_voice_embedding(name); -} - -std::vector KokoroTts::load_voice_embedding(const std::string& name) { - std::string path = voices_dir_ + "/" + name + ".bin"; - std::ifstream file(path, std::ios::binary); - if (!file.is_open()) { - LOGE("Voice file not found: %s", path.c_str()); - return std::vector(256, 0.0f); - } - - std::vector embedding(256); - file.read(reinterpret_cast(embedding.data()), 256 * sizeof(float)); - return embedding; -} - -void KokoroTts::auto_switch_voice(const std::string& lang) { - if (lang == current_lang_) return; - current_lang_ = lang; - - // Map language to default voice - struct LangVoice { const char* lang; const char* voice; }; - static const LangVoice map[] = { - {"en", "af_heart"}, - {"fr", "ff_siwis"}, - {"es", "ef_dora"}, - {"it", "if_sara"}, - {"pt", "pf_dora"}, - {"hi", "hf_alpha"}, - {"ja", "jf_alpha"}, - {"zh", "zf_xiaobei"}, - {"ko", "kf_somi"}, - }; - - for (auto& entry : map) { - if (lang == entry.lang) { - auto emb = load_voice_embedding(entry.voice); - if (emb[0] != 0.0f || emb[1] != 0.0f) { // check not zeroed (missing file) - voice_embedding_ = std::move(emb); - LOGI("TTS: auto-switched voice to %s for language %s", entry.voice, entry.lang); - } - return; - } - } - // Unknown language — keep current voice -} - -void KokoroTts::synthesize( - const char* text, const char* language, - ChunkCallback on_chunk, void* ctx) -{ - cancelled_ = false; - - // Set language and auto-switch voice if language changed - std::string lang = (language && language[0]) ? 
language : "en"; - phonemizer_.set_language(lang); - auto_switch_voice(lang); - auto* mem = OnnxEngine::get().cpu_memory(); - - // Text → phoneme token IDs - auto raw_tokens = phonemizer_.tokenize(text, MAX_PHONEMES); - if (raw_tokens.empty() || cancelled_) return; - - size_t token_count = raw_tokens.size(); - - LOGI("TTS: text='%.60s' tokens=%zu", text, token_count); - - // Pad to fixed MAX_PHONEMES with attention mask - std::vector input_ids(MAX_PHONEMES, 0); - std::vector attention_mask(MAX_PHONEMES, 0); - for (size_t i = 0; i < token_count && i < MAX_PHONEMES; i++) { - input_ids[i] = raw_tokens[i]; - attention_mask[i] = 1; - } - - // --- input tensors --- - - const int64_t ids_shape[] = {1, MAX_PHONEMES}; - - // input_ids [1, 128] - OrtValue* t_ids = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, input_ids.data(), input_ids.size() * sizeof(int64_t), - ids_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_ids)); - - // attention_mask [1, 128] - OrtValue* t_mask = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, attention_mask.data(), attention_mask.size() * sizeof(int64_t), - ids_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_mask)); - - // ref_s / voice embedding [1, 256] - const int64_t style_shape[] = {1, 256}; - OrtValue* t_style = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, voice_embedding_.data(), voice_embedding_.size() * sizeof(float), - style_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_style)); - - // speed [1] - float speed = 0.85f; - const int64_t speed_shape[] = {1}; - OrtValue* t_speed = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, &speed, sizeof(float), - speed_shape, 1, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_speed)); - - // random_phases [1, 9] - float phases[9]; - for (int i = 0; i < 9; i++) - phases[i] = static_cast(rand()) / static_cast(RAND_MAX); - const int64_t phases_shape[] = {1, 9}; - OrtValue* t_phases = nullptr; - 
ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, phases, sizeof(phases), - phases_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_phases)); - - // --- run --- - - const char* in_names[] = {"input_ids", "attention_mask", "ref_s", "speed", "random_phases"}; - const char* out_names[] = {"audio", "audio_length_samples", "pred_dur"}; - OrtValue* inputs[] = {t_ids, t_mask, t_style, t_speed, t_phases}; - OrtValue* outputs[] = {nullptr, nullptr, nullptr}; - - ort_check(api_, api_->Run( - session_, nullptr, - in_names, inputs, 5, - out_names, 3, outputs)); - - if (!cancelled_) { - float* audio = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[0], (void**)&audio)); - - // Get valid sample count from model - int64_t* len_ptr = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[1], (void**)&len_ptr)); - size_t valid_samples = static_cast(len_ptr[0]); - - // Inspect peak before any processing — short prompts (≤5 tokens) can - // make the E2E ONNX export numerically explode (peak in the hundreds). - // Treat that as a synthesis failure rather than amplifying garbage. - float peak = 0.0f; - for (size_t i = 0; i < valid_samples; i++) { - float a = std::abs(audio[i]); - if (a > peak) peak = a; - } - if (peak > 2.0f) { - LOGI("TTS: dropping output, peak=%.2f indicates numerical instability " - "(short prompt? text='%.40s')", peak, text); - // Cleanup outputs and return without emitting audio - for (int i = 2; i >= 0; i--) api_->ReleaseValue(outputs[i]); - api_->ReleaseValue(t_phases); - api_->ReleaseValue(t_speed); - api_->ReleaseValue(t_style); - api_->ReleaseValue(t_mask); - api_->ReleaseValue(t_ids); - return; - } - - // Trim trailing artifacts — Kokoro's E2E model often emits 100-300 ms - // of low-energy noise + occasional loud spike past the real speech. - // Walk backwards through 50 ms windows; the last window above the - // silence floor is where speech ended. 
Sustained-energy threshold - // (50 ms window) avoids mistaking isolated artifact spikes for - // speech. Mirrors KokoroTTSModel.synthesize() in speech-swift. - constexpr int sample_rate = 24000; - constexpr float silence_rms = 0.030f; - const size_t win = std::max(1, sample_rate / 20); // 50 ms - size_t speech_end = valid_samples; - if (valid_samples > win) { - for (size_t i = valid_samples - win; i > 0; i -= win / 2) { - float sum_sq = 0.0f; - for (size_t j = 0; j < win; j++) { - float v = audio[i + j]; - sum_sq += v * v; - } - float rms = std::sqrt(sum_sq / static_cast(win)); - if (rms > silence_rms) { - speech_end = i + win; - break; - } - if (i < win / 2) break; - } - } - if (speech_end < valid_samples) { - for (size_t k = speech_end; k < valid_samples; k++) audio[k] = 0.0f; - } - // ~10 ms linear fade-out at the new tail boundary so the seam is smooth. - const size_t fade_out = std::min(speech_end, sample_rate / 100); - if (fade_out >= 2) { - const size_t start = speech_end - fade_out; - const float denom = static_cast(fade_out - 1); - for (size_t k = 0; k < fade_out; k++) { - float gain = static_cast(fade_out - 1 - k) / denom; - audio[start + k] *= gain; - } - } - // 5 ms fade-in to prevent click at start. 
- const size_t fade_in = std::min(120, speech_end); - for (size_t i = 0; i < fade_in; i++) { - audio[i] *= static_cast(i) / static_cast(fade_in); - } - - LOGI("TTS: valid=%zu speech_end=%zu peak=%.4f", valid_samples, speech_end, peak); - - on_chunk(audio, speech_end, true, ctx); - } - - // --- cleanup --- - - for (int i = 2; i >= 0; i--) api_->ReleaseValue(outputs[i]); - api_->ReleaseValue(t_phases); - api_->ReleaseValue(t_speed); - api_->ReleaseValue(t_style); - api_->ReleaseValue(t_mask); - api_->ReleaseValue(t_ids); -} - -void KokoroTts::cancel() { - cancelled_ = true; -} diff --git a/sdk/src/main/cpp/models/kokoro_tts.h b/sdk/src/main/cpp/models/kokoro_tts.h deleted file mode 100644 index dc732d8..0000000 --- a/sdk/src/main/cpp/models/kokoro_tts.h +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once - -#include -#include -#include -#include "kokoro_phonemizer.h" - -/// Kokoro 82M — lightweight text-to-speech via ONNX Runtime. -/// Non-autoregressive, single-pass synthesis. -/// Output: 24 kHz PCM Float32. 
-class KokoroTts { -public: - KokoroTts(const std::string& model_path, - const std::string& voices_dir, - const std::string& data_dir, - bool nnapi = true); - ~KokoroTts(); - - using ChunkCallback = void(*)(const float* samples, size_t length, - bool is_final, void* ctx); - - void synthesize(const char* text, const char* language, - ChunkCallback on_chunk, void* ctx); - void cancel(); - int output_sample_rate() const { return 24000; } - - void set_voice(const std::string& name); - -private: - std::vector load_voice_embedding(const std::string& name); - void auto_switch_voice(const std::string& language); - - const OrtApi* api_; - OrtSession* session_ = nullptr; - - KokoroPhonemizer phonemizer_; - std::vector voice_embedding_; - std::string voices_dir_; - std::string current_lang_; - bool cancelled_ = false; -}; diff --git a/sdk/src/main/cpp/models/onnx_backend.h b/sdk/src/main/cpp/models/onnx_backend.h deleted file mode 100644 index 0111b43..0000000 --- a/sdk/src/main/cpp/models/onnx_backend.h +++ /dev/null @@ -1,131 +0,0 @@ -#pragma once - -#include "inference_engine.h" -#include "onnx_engine.h" -#include -#include -#include - -/// ONNX Runtime output tensor — wraps OrtValue*. 
-class OnnxOutputTensor : public OutputTensor { -public: - OnnxOutputTensor(const OrtApi* api, OrtValue* value) : api_(api), value_(value) {} - - ~OnnxOutputTensor() override { - if (value_) api_->ReleaseValue(value_); - } - - float* data_float() override { - float* data = nullptr; - ort_check(api_, api_->GetTensorMutableData(value_, (void**)&data)); - return data; - } - - int64_t* data_int64() override { - int64_t* data = nullptr; - ort_check(api_, api_->GetTensorMutableData(value_, (void**)&data)); - return data; - } - - std::vector shape() override { - OrtTensorTypeAndShapeInfo* info = nullptr; - ort_check(api_, api_->GetTensorTypeAndShape(value_, &info)); - size_t dim_count = 0; - api_->GetDimensionsCount(info, &dim_count); - std::vector dims(dim_count); - api_->GetDimensions(info, dims.data(), dim_count); - api_->ReleaseTensorTypeAndShapeInfo(info); - return dims; - } - - size_t element_count() override { - auto s = shape(); - size_t n = 1; - for (auto d : s) n *= static_cast(d); - return n; - } - -private: - const OrtApi* api_; - OrtValue* value_; -}; - -/// ONNX Runtime session — wraps OrtSession*. 
-class OnnxSession : public InferenceSession { -public: - OnnxSession(const OrtApi* api, OrtSession* session) - : api_(api), session_(session) {} - - ~OnnxSession() override { - if (session_) api_->ReleaseSession(session_); - } - - std::vector> run( - const std::vector& input_names, - const std::vector& inputs, - const std::vector& output_names) override - { - auto* mem = OnnxEngine::get().cpu_memory(); - size_t num_in = inputs.size(); - size_t num_out = output_names.size(); - - // Create input OrtValues - std::vector ort_inputs(num_in, nullptr); - for (size_t i = 0; i < num_in; i++) { - auto& t = inputs[i]; - ONNXTensorElementDataType ort_dtype; - switch (t.dtype) { - case DType::FLOAT32: ort_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; break; - case DType::INT64: ort_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; break; - case DType::INT32: ort_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; break; - case DType::INT8: ort_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; break; - } - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, - const_cast(t.data), - t.byte_size(), - t.shape.data(), - t.shape.size(), - ort_dtype, - &ort_inputs[i])); - } - - // Prepare output array - std::vector ort_outputs(num_out, nullptr); - - // Run - ort_check(api_, api_->Run( - session_, nullptr, - input_names.data(), ort_inputs.data(), num_in, - output_names.data(), num_out, ort_outputs.data())); - - // Release inputs - for (auto* v : ort_inputs) api_->ReleaseValue(v); - - // Wrap outputs - std::vector> results; - results.reserve(num_out); - for (auto* v : ort_outputs) { - results.push_back(std::make_unique(api_, v)); - } - return results; - } - -private: - const OrtApi* api_; - OrtSession* session_; -}; - -/// ONNX Runtime backend — delegates to OnnxEngine singleton. 
-class OnnxBackend : public InferenceBackend { -public: - std::unique_ptr load( - const std::string& path, bool hw_accel = true) override - { - auto& engine = OnnxEngine::get(); - OrtSession* session = engine.load(path, hw_accel); - return std::make_unique(engine.api(), session); - } - - Backend type() const override { return Backend::ONNX; } -}; diff --git a/sdk/src/main/cpp/models/onnx_engine.h b/sdk/src/main/cpp/models/onnx_engine.h deleted file mode 100644 index 4dcee0f..0000000 --- a/sdk/src/main/cpp/models/onnx_engine.h +++ /dev/null @@ -1,122 +0,0 @@ -#pragma once - -#include -#include -#include - -#ifdef __ANDROID__ -#include -#define LOG_TAG "Speech" -#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__) -#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__) -#else -#include -#define LOGI(...) do { fprintf(stderr, "[speech] "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0) -#define LOGE(...) do { fprintf(stderr, "[speech ERROR] "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0) -#endif - -inline void ort_check(const OrtApi* api, OrtStatus* status) { - if (status != nullptr) { - const char* msg = api->GetErrorMessage(status); - std::string err(msg); - api->ReleaseStatus(status); - throw std::runtime_error("ORT: " + err); - } -} - -/// Singleton ONNX Runtime environment shared across all models. -class OnnxEngine { -public: - static OnnxEngine& get() { - static OnnxEngine instance; - return instance; - } - - const OrtApi* api() const { return api_; } - OrtEnv* env() const { return env_; } - - /// True if any model fell back from NNAPI to CPU during session creation. 
- bool had_nnapi_fallback() const { return nnapi_fallback_; } - const std::string& nnapi_fallback_reason() const { return nnapi_fallback_reason_; } - - OrtSession* load(const std::string& path, bool nnapi = true) { - OrtSessionOptions* opts = nullptr; - ort_check(api_, api_->CreateSessionOptions(&opts)); - api_->SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL); - api_->SetIntraOpNumThreads(opts, 2); - - if (nnapi) { - LOGI("Loading model with hardware acceleration: %s", - path.substr(path.find_last_of('/') + 1).c_str()); -#ifdef __ANDROID__ - const char* keys[] = {"nnapi_flags"}; - const char* values[] = {"0"}; - OrtStatus* s = api_->SessionOptionsAppendExecutionProvider( - opts, "NNAPI", keys, values, 1); -#else - const char* keys[] = {"backend_path"}; - const char* values[] = {"libQnnHtp.so"}; - OrtStatus* s = api_->SessionOptionsAppendExecutionProvider( - opts, "QNN", keys, values, 1); -#endif - if (s != nullptr) { - LOGI("Hardware EP unavailable, using CPU"); - api_->ReleaseStatus(s); - } - } - - OrtSession* session = nullptr; - OrtStatus* create_status = api_->CreateSession(env_, path.c_str(), opts, &session); - - // If session creation fails with NNAPI, retry CPU-only - if (create_status != nullptr && nnapi) { - const char* msg = api_->GetErrorMessage(create_status); - LOGI("NNAPI session failed (%s), retrying CPU-only", msg); - nnapi_fallback_ = true; - nnapi_fallback_reason_ = msg; - api_->ReleaseStatus(create_status); - api_->ReleaseSessionOptions(opts); - - opts = nullptr; - ort_check(api_, api_->CreateSessionOptions(&opts)); - api_->SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL); - api_->SetIntraOpNumThreads(opts, 4); - - ort_check(api_, api_->CreateSession(env_, path.c_str(), opts, &session)); - } else if (create_status != nullptr) { - // CPU-only also failed — propagate the error - const char* msg = api_->GetErrorMessage(create_status); - std::string err(msg); - api_->ReleaseStatus(create_status); - api_->ReleaseSessionOptions(opts); - 
throw std::runtime_error("ORT: " + err); - } - - api_->ReleaseSessionOptions(opts); - return session; - } - - OrtMemoryInfo* cpu_memory() const { return mem_; } - - ~OnnxEngine() { - if (mem_) api_->ReleaseMemoryInfo(mem_); - if (env_) api_->ReleaseEnv(env_); - } - -private: - OnnxEngine() { - api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION); - ort_check(api_, api_->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "speech", &env_)); - ort_check(api_, api_->CreateCpuMemoryInfo( - OrtArenaAllocator, OrtMemTypeDefault, &mem_)); - } - - OnnxEngine(const OnnxEngine&) = delete; - OnnxEngine& operator=(const OnnxEngine&) = delete; - - const OrtApi* api_ = nullptr; - OrtEnv* env_ = nullptr; - OrtMemoryInfo* mem_ = nullptr; - bool nnapi_fallback_ = false; - std::string nnapi_fallback_reason_; -}; diff --git a/sdk/src/main/cpp/models/parakeet_stt.cpp b/sdk/src/main/cpp/models/parakeet_stt.cpp deleted file mode 100644 index df73bcc..0000000 --- a/sdk/src/main/cpp/models/parakeet_stt.cpp +++ /dev/null @@ -1,412 +0,0 @@ -#include "parakeet_stt.h" -#include "onnx_engine.h" -#include "../audio/mel.h" -#include "../util/json.h" -#include -#include -#include - -// --------------------------------------------------------------------------- -// SentencePiece U+2581 → space, then trim -// --------------------------------------------------------------------------- - -static void replace_sp_marker(std::string& s) { - const std::string marker = "\xE2\x96\x81"; - size_t pos = 0; - while ((pos = s.find(marker, pos)) != std::string::npos) { - s.replace(pos, marker.size(), " "); - pos += 1; - } -} - -// --------------------------------------------------------------------------- -// Construction -// --------------------------------------------------------------------------- - -ParakeetStt::ParakeetStt( - const std::string& encoder_path, - const std::string& decoder_joint_path, - const std::string& vocab_path, - bool nnapi) -{ - auto& engine = OnnxEngine::get(); - api_ = engine.api(); - encoder_ = 
engine.load(encoder_path, nnapi); - decoder_joint_ = engine.load(decoder_joint_path, false); - - load_vocab(vocab_path); -} - -ParakeetStt::~ParakeetStt() { - if (decoder_joint_) api_->ReleaseSession(decoder_joint_); - if (encoder_) api_->ReleaseSession(encoder_); -} - -// --------------------------------------------------------------------------- -// Vocabulary -// --------------------------------------------------------------------------- - -bool ParakeetStt::load_vocab(const std::string& path) { - auto text = json::read_file(path); - if (text.empty()) return false; - - auto flat = json::parse_flat_object(text); - for (auto& [key, val] : flat) { - try { - int id = std::stoi(key); - vocab_[id] = val; - - // Index language tokens like <|en|>, <|fr|>, etc. - if (val.size() >= 5 && val.size() <= 6 && - val.substr(0, 2) == "<|" && val.substr(val.size() - 2) == "|>") { - std::string code = val.substr(2, val.size() - 4); - lang_tokens_[id] = code; - } - } catch (...) {} - } - - // Update config based on actual vocab size - if (!vocab_.empty()) { - cfg_.vocab_size = static_cast(vocab_.size()); - cfg_.blank_id = cfg_.vocab_size; - cfg_.total_logits = cfg_.vocab_size + 1 + cfg_.num_dur_bins; - } - - LOGI("Parakeet vocab: %zu tokens, %zu language tokens, blank=%d", - vocab_.size(), lang_tokens_.size(), cfg_.blank_id); - return !vocab_.empty(); -} - -std::string ParakeetStt::decode_tokens(const std::vector& token_ids) { - std::string pieces; - for (int id : token_ids) { - auto it = vocab_.find(id); - if (it != vocab_.end()) pieces += it->second; - } - replace_sp_marker(pieces); - - size_t start = pieces.find_first_not_of(' '); - if (start == std::string::npos) return ""; - size_t end = pieces.find_last_not_of(' '); - return pieces.substr(start, end - start + 1); -} - -// --------------------------------------------------------------------------- -// Mel spectrogram -// --------------------------------------------------------------------------- - -std::vector 
ParakeetStt::compute_mel(const float* audio, size_t length) { - std::vector emphasized(length); - emphasized[0] = audio[0]; - for (size_t i = 1; i < length; i++) { - emphasized[i] = audio[i] - cfg_.pre_emphasis * audio[i - 1]; - } - - auto mel = mel_spectrogram( - emphasized.data(), emphasized.size(), - cfg_.sample_rate, cfg_.n_fft, cfg_.hop_length, - cfg_.win_length, cfg_.num_mel_bins); - - // Per-feature normalization (NeMo AudioToMelSpectrogramPreprocessor) - // mel layout: [num_mel_bins * num_frames], mel[m * num_frames + t] - int num_frames = static_cast(mel.size() / cfg_.num_mel_bins); - if (num_frames > 1) { - for (int m = 0; m < cfg_.num_mel_bins; m++) { - float sum = 0, sq_sum = 0; - for (int t = 0; t < num_frames; t++) { - float v = mel[m * num_frames + t]; - sum += v; - sq_sum += v * v; - } - float mean = sum / num_frames; - float var = sq_sum / num_frames - mean * mean; - float std = (var > 0) ? std::sqrt(var) : 1.0f; - for (int t = 0; t < num_frames; t++) { - mel[m * num_frames + t] = (mel[m * num_frames + t] - mean) / std; - } - } - } - - return mel; -} - -// --------------------------------------------------------------------------- -// Transcribe -// --------------------------------------------------------------------------- - -ParakeetStt::Result ParakeetStt::transcribe( - const float* audio, size_t length, int /*sample_rate*/) -{ - auto* mem = OnnxEngine::get().cpu_memory(); - - // --- mel spectrogram [B, 128, T] --- - - auto mel = compute_mel(audio, length); - int64_t num_frames = static_cast(mel.size() / cfg_.num_mel_bins); - const int64_t mel_shape[] = {1, static_cast(cfg_.num_mel_bins), num_frames}; - - OrtValue* t_mel = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, mel.data(), mel.size() * sizeof(float), - mel_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_mel)); - - int64_t mel_len = num_frames; - const int64_t len_shape[] = {1}; - OrtValue* t_len = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( 
- mem, &mel_len, sizeof(int64_t), - len_shape, 1, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_len)); - - // --- encoder: audio_signal, length → outputs, encoded_lengths --- - - const char* enc_in[] = {"audio_signal", "length"}; - const char* enc_out[] = {"outputs", "encoded_lengths"}; - OrtValue* enc_inputs[] = {t_mel, t_len}; - OrtValue* enc_outputs[] = {nullptr, nullptr}; - - ort_check(api_, api_->Run( - encoder_, nullptr, enc_in, enc_inputs, 2, enc_out, 2, enc_outputs)); - - // Get encoded shape [B, 1024, T'] - OrtTensorTypeAndShapeInfo* info = nullptr; - ort_check(api_, api_->GetTensorTypeAndShape(enc_outputs[0], &info)); - size_t dim_count = 0; - api_->GetDimensionsCount(info, &dim_count); - std::vector enc_shape(dim_count); - api_->GetDimensions(info, enc_shape.data(), dim_count); - api_->ReleaseTensorTypeAndShapeInfo(info); - - float* encoded = nullptr; - ort_check(api_, api_->GetTensorMutableData(enc_outputs[0], (void**)&encoded)); - - int64_t* enc_len_ptr = nullptr; - ort_check(api_, api_->GetTensorMutableData(enc_outputs[1], (void**)&enc_len_ptr)); - int64_t enc_len = enc_len_ptr[0]; - int64_t hidden = (dim_count >= 3) ? 
enc_shape[1] : cfg_.encoder_hidden; - - LOGI("STT: frames=%lld enc_len=%lld hidden=%lld audio=%zu enc_range=[%.4f,%.4f]", - num_frames, enc_len, hidden, length, - [&]{ float mn=encoded[0]; for(size_t i=1;i<(size_t)(hidden*enc_len);i++) if(encoded[i]mx) mx=encoded[i]; return mx; }()); - - // --- TDT greedy decode --- - - auto result = tdt_decode(encoded, enc_len, hidden); - - LOGI("STT: text='%.60s' conf=%.4f", result.text.c_str(), result.confidence); - - // --- cleanup --- - - api_->ReleaseValue(enc_outputs[1]); - api_->ReleaseValue(enc_outputs[0]); - api_->ReleaseValue(t_len); - api_->ReleaseValue(t_mel); - - return result; -} - -// --------------------------------------------------------------------------- -// TDT greedy decoding with fused decoder_joint model -// --------------------------------------------------------------------------- - -ParakeetStt::Result ParakeetStt::tdt_decode( - const float* encoded, int64_t enc_len, int64_t hidden) -{ - auto* mem = OnnxEngine::get().cpu_memory(); - - std::vector token_ids; - std::string detected_language; - float log_prob_sum = 0.0f; - int log_prob_count = 0; - - // LSTM states: [2, 1, 640] - int64_t state_size = cfg_.decoder_layers * 1 * cfg_.decoder_hidden; - std::vector h_state(state_size, 0.0f); - std::vector c_state(state_size, 0.0f); - const int64_t lstm_shape[] = { - static_cast(cfg_.decoder_layers), 1, - static_cast(cfg_.decoder_hidden) - }; - - int64_t prev_token = static_cast(cfg_.blank_id); - int64_t t = 0; - - while (t < enc_len) { - // Encoder frame at time t: [1, hidden, 1] - // NeMo encoder output is [B, hidden, T'] so frame is at offset t - // But decoder_joint expects encoder_outputs [B, hidden, T'] - // For greedy decode, we pass a single frame [1, hidden, 1] - std::vector enc_frame(hidden); - for (int64_t h = 0; h < hidden; h++) { - enc_frame[h] = encoded[h * enc_len + t]; // [B, H, T] layout - } - - const int64_t enc_frame_shape[] = {1, hidden, 1}; - OrtValue* t_enc = nullptr; - ort_check(api_, 
api_->CreateTensorWithDataAsOrtValue( - mem, enc_frame.data(), enc_frame.size() * sizeof(float), - enc_frame_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_enc)); - - // Target: previous token [1, 1] - const int64_t tok_shape[] = {1, 1}; - OrtValue* t_tok = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, &prev_token, sizeof(int64_t), - tok_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_tok)); - - // Target length: [1] = 1 - int64_t tgt_len = 1; - const int64_t tgt_len_shape[] = {1}; - OrtValue* t_tgt_len = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, &tgt_len, sizeof(int64_t), - tgt_len_shape, 1, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_tgt_len)); - - // LSTM states - OrtValue* t_h = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, h_state.data(), h_state.size() * sizeof(float), - lstm_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_h)); - - OrtValue* t_c = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, c_state.data(), c_state.size() * sizeof(float), - lstm_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_c)); - - // Run decoder_joint (v3 uses "prednet_lengths_orig" instead of "target_length") - const char* in_names[] = {"encoder_outputs", "targets", "prednet_lengths_orig", - "input_states_1", "input_states_2"}; - const char* out_names[] = {"outputs", "prednet_lengths", - "output_states_1", "output_states_2"}; - OrtValue* inputs[] = {t_enc, t_tok, t_tgt_len, t_h, t_c}; - OrtValue* outputs[] = {nullptr, nullptr, nullptr, nullptr}; - - ort_check(api_, api_->Run( - decoder_joint_, nullptr, - in_names, inputs, 5, - out_names, 4, outputs)); - - // Logits: [1, 1, 1, total_logits] — token logits + duration logits - float* logits = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[0], (void**)&logits)); - - int token_end = cfg_.vocab_size + 1; // includes blank - - // Greedy argmax: token - int best_token = 0; - float best_score = logits[0]; - for (int i = 
1; i < token_end; i++) { - if (logits[i] > best_score) { - best_score = logits[i]; - best_token = i; - } - } - - if (best_token == cfg_.blank_id) { - // Blank: advance time, keep LSTM state unchanged - t += 1; - } else { - if (best_token >= cfg_.first_text_token && best_token < cfg_.vocab_size) { - // Check if this is a language token - auto lang_it = lang_tokens_.find(best_token); - if (lang_it != lang_tokens_.end()) { - if (detected_language.empty()) { - detected_language = lang_it->second; - } - // Don't add language tokens to output text - } else { - token_ids.push_back(best_token); - log_prob_sum += best_score; - log_prob_count++; - } - } - - // Duration logits start after token logits - float* dur_logits = logits + token_end; - int dur_idx = 0; - float best_dur = dur_logits[0]; - for (int d = 1; d < cfg_.num_dur_bins; d++) { - if (dur_logits[d] > best_dur) { - best_dur = dur_logits[d]; - dur_idx = d; - } - } - t += std::max(cfg_.duration_bins[dur_idx], 1); - - prev_token = best_token; - - // Update LSTM states only on non-blank emission - float* h_out = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[2], (void**)&h_out)); - std::memcpy(h_state.data(), h_out, state_size * sizeof(float)); - - float* c_out = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[3], (void**)&c_out)); - std::memcpy(c_state.data(), c_out, state_size * sizeof(float)); - } - - // Cleanup - for (int i = 3; i >= 0; i--) api_->ReleaseValue(outputs[i]); - api_->ReleaseValue(t_c); - api_->ReleaseValue(t_h); - api_->ReleaseValue(t_tgt_len); - api_->ReleaseValue(t_tok); - api_->ReleaseValue(t_enc); - } - - // --- build result --- - - Result result; - result.text = decode_tokens(token_ids); - result.language = detected_language; - - if (log_prob_count > 0) { - float mean_logit = log_prob_sum / static_cast(log_prob_count); - result.confidence = 1.0f / (1.0f + std::exp(-mean_logit * 0.1f)); - } - - if (!result.language.empty()) { - LOGI("STT: detected language=%s", 
result.language.c_str()); - } - - return result; -} - -// --------------------------------------------------------------------------- -// Streaming: accumulate audio and re-transcribe -// --------------------------------------------------------------------------- - -void ParakeetStt::begin_stream(int sample_rate) { - stream_buffer_.clear(); - stream_sample_rate_ = sample_rate; - streaming_ = true; -} - -ParakeetStt::Result ParakeetStt::push_chunk(const float* audio, size_t length) { - stream_buffer_.insert(stream_buffer_.end(), audio, audio + length); - - // Need at least 0.5s of audio for meaningful transcription - if (stream_buffer_.size() < static_cast(stream_sample_rate_ / 2)) { - return {}; - } - - return transcribe(stream_buffer_.data(), stream_buffer_.size(), stream_sample_rate_); -} - -ParakeetStt::Result ParakeetStt::end_stream() { - streaming_ = false; - if (stream_buffer_.empty()) return {}; - - auto result = transcribe(stream_buffer_.data(), stream_buffer_.size(), stream_sample_rate_); - stream_buffer_.clear(); - return result; -} - -void ParakeetStt::cancel_stream() { - stream_buffer_.clear(); - streaming_ = false; -} - -void ParakeetStt::flush_stream() { - // No-op — single-utterance sessions only -} diff --git a/sdk/src/main/cpp/models/parakeet_stt.h b/sdk/src/main/cpp/models/parakeet_stt.h deleted file mode 100644 index 5bd974f..0000000 --- a/sdk/src/main/cpp/models/parakeet_stt.h +++ /dev/null @@ -1,78 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -/// Parakeet TDT v3 (0.6B) — speech recognition via ONNX Runtime. -/// FastConformer encoder + fused LSTM decoder/joint network. -/// Exported via NeMo as 2 ONNX models: encoder + decoder_joint. -/// Input: PCM Float32 audio at 16 kHz. -/// Output: transcribed text with language detection. 
-class ParakeetStt { -public: - struct Config { - int num_mel_bins = 128; - int sample_rate = 16000; - int n_fft = 512; - int hop_length = 160; - int win_length = 400; - float pre_emphasis = 0.97f; - int encoder_hidden = 1024; - int decoder_hidden = 640; - int decoder_layers = 2; - int vocab_size = 1024; // SentencePiece BPE - int blank_id = 1024; // vocab_size - int num_dur_bins = 5; - int duration_bins[5] = {0, 1, 2, 3, 4}; - int total_logits = 1030; // vocab_size+1 + num_dur_bins - int first_text_token = 0; // Only token 0 () is special - }; - - struct Result { - std::string text; - std::string language; - float confidence = 0.0f; - }; - - /// Load encoder + decoder_joint ONNX models and vocabulary. - ParakeetStt(const std::string& encoder_path, - const std::string& decoder_joint_path, - const std::string& vocab_path, - bool nnapi = true); - ~ParakeetStt(); - - Result transcribe(const float* audio, size_t length, int sample_rate); - int input_sample_rate() const { return cfg_.sample_rate; } - - // Streaming: accumulate audio and re-transcribe on each push_chunk call - bool supports_streaming() const { return true; } - void begin_stream(int sample_rate); - Result push_chunk(const float* audio, size_t length); - Result end_stream(); - void cancel_stream(); - void flush_stream(); - -private: - bool load_vocab(const std::string& path); - std::vector compute_mel(const float* audio, size_t length); - Result tdt_decode(const float* encoded, int64_t enc_len, int64_t hidden); - std::string decode_tokens(const std::vector& token_ids); - - const OrtApi* api_; - OrtSession* encoder_ = nullptr; - OrtSession* decoder_joint_ = nullptr; - Config cfg_; - - // SentencePiece vocabulary: token ID → token string - std::unordered_map vocab_; - - // Language tokens: token ID → ISO 639-1 code (e.g. 
64 → "en", 71 → "fr") - std::unordered_map lang_tokens_; - - // Streaming state - std::vector stream_buffer_; - int stream_sample_rate_ = 16000; - bool streaming_ = false; -}; diff --git a/sdk/src/main/cpp/models/silero_vad.cpp b/sdk/src/main/cpp/models/silero_vad.cpp deleted file mode 100644 index 978797b..0000000 --- a/sdk/src/main/cpp/models/silero_vad.cpp +++ /dev/null @@ -1,74 +0,0 @@ -#include "silero_vad.h" -#include "onnx_engine.h" -#include - -SileroVad::SileroVad(const std::string& model_path, bool nnapi) { - auto& engine = OnnxEngine::get(); - api_ = engine.api(); - session_ = engine.load(model_path, nnapi); - reset(); -} - -SileroVad::~SileroVad() { - if (session_) api_->ReleaseSession(session_); -} - -void SileroVad::reset() { - state_.fill(0.0f); -} - -float SileroVad::process_chunk(const float* samples, size_t length) { - auto* mem = OnnxEngine::get().cpu_memory(); - - // --- input tensors --- - - const int64_t input_shape[] = {1, static_cast(length)}; - OrtValue* t_input = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, const_cast(samples), length * sizeof(float), - input_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_input)); - - // sr is a scalar (no shape dimensions) - OrtValue* t_sr = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, &sr_, sizeof(int64_t), - nullptr, 0, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_sr)); - - const int64_t state_shape[] = {2, 1, 128}; - OrtValue* t_state = nullptr; - ort_check(api_, api_->CreateTensorWithDataAsOrtValue( - mem, state_.data(), state_.size() * sizeof(float), - state_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_state)); - - // --- run --- - - const char* in_names[] = {"input", "state", "sr"}; - const char* out_names[] = {"output", "stateN"}; - OrtValue* inputs[] = {t_input, t_state, t_sr}; - OrtValue* outputs[] = {nullptr, nullptr}; - - ort_check(api_, api_->Run( - session_, nullptr, - in_names, inputs, 3, - out_names, 2, outputs)); - - // --- extract 
--- - - float* out_data = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[0], (void**)&out_data)); - float prob = out_data[0]; - - float* new_state = nullptr; - ort_check(api_, api_->GetTensorMutableData(outputs[1], (void**)&new_state)); - std::memcpy(state_.data(), new_state, state_.size() * sizeof(float)); - - // --- cleanup --- - - api_->ReleaseValue(outputs[1]); - api_->ReleaseValue(outputs[0]); - api_->ReleaseValue(t_state); - api_->ReleaseValue(t_sr); - api_->ReleaseValue(t_input); - - return prob; -} diff --git a/sdk/src/main/cpp/models/silero_vad.h b/sdk/src/main/cpp/models/silero_vad.h deleted file mode 100644 index 7c7307c..0000000 --- a/sdk/src/main/cpp/models/silero_vad.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include -#include -#include - -/// Silero VAD v5 — voice activity detection via ONNX Runtime. -/// Input: 512 samples (32 ms @ 16 kHz) per chunk. -/// Output: speech probability [0, 1]. -class SileroVad { -public: - explicit SileroVad(const std::string& model_path, bool nnapi = false); - ~SileroVad(); - - float process_chunk(const float* samples, size_t length); - void reset(); - - int input_sample_rate() const { return 16000; } - size_t chunk_size() const { return 512; } - -private: - const OrtApi* api_; - OrtSession* session_ = nullptr; - - // LSTM state carried across chunks (Silero v5: [2, 1, 128]) - static constexpr size_t kStateSize = 2 * 1 * 128; - std::array state_{}; - int64_t sr_ = 16000; -}; diff --git a/sdk/src/main/cpp/models/soc_detect.cpp b/sdk/src/main/cpp/models/soc_detect.cpp deleted file mode 100644 index 5d8c546..0000000 --- a/sdk/src/main/cpp/models/soc_detect.cpp +++ /dev/null @@ -1,88 +0,0 @@ -#include "inference_engine.h" -#include "onnx_backend.h" - -#ifdef __ANDROID__ -#include -#include -#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, "Speech", __VA_ARGS__) -#else -#include -#define LOGI(...) 
do { fprintf(stderr, "[speech] "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0) -#endif - -#include - -enum class SocVendor { GOOGLE_TENSOR, QUALCOMM, SAMSUNG, MEDIATEK, UNKNOWN }; - -static SocVendor detect_soc() { -#ifdef __ANDROID__ - char value[92] = {}; - - // Google Tensor: ro.hardware.chipname starts with "gs" or "zuma" - __system_property_get("ro.hardware.chipname", value); - std::string chipname(value); - if (chipname.find("gs") == 0 || chipname.find("zuma") == 0) { - LOGI("SoC: Google Tensor (%s)", chipname.c_str()); - return SocVendor::GOOGLE_TENSOR; - } - - // Qualcomm: ro.board.platform starts with "msm", "sm", "sdm" - __system_property_get("ro.board.platform", value); - std::string platform(value); - if (platform.find("msm") == 0 || platform.find("sm") == 0 || - platform.find("sdm") == 0 || platform.find("lahaina") != std::string::npos || - platform.find("taro") != std::string::npos || platform.find("kalama") != std::string::npos || - platform.find("pineapple") != std::string::npos || platform.find("sun") != std::string::npos) { - LOGI("SoC: Qualcomm (%s)", platform.c_str()); - return SocVendor::QUALCOMM; - } - - // Samsung Exynos - __system_property_get("ro.hardware", value); - std::string hardware(value); - if (hardware.find("exynos") != std::string::npos) { - LOGI("SoC: Samsung Exynos (%s)", hardware.c_str()); - return SocVendor::SAMSUNG; - } - - LOGI("SoC: Unknown (chipname=%s, platform=%s, hardware=%s)", - chipname.c_str(), platform.c_str(), hardware.c_str()); -#endif - return SocVendor::UNKNOWN; -} - -Backend detect_optimal_backend() { - SocVendor soc = detect_soc(); - switch (soc) { -#ifdef SPEECH_LITERT - case SocVendor::GOOGLE_TENSOR: - return Backend::LITERT; -#endif - default: - return Backend::ONNX; - } -} - -std::unique_ptr create_backend(Backend preference) { - Backend actual = preference; - if (actual == Backend::AUTO) { - actual = detect_optimal_backend(); - } - -#ifdef SPEECH_LITERT - if (actual == Backend::LITERT) 
{ - // LiteRT backend will be implemented in litert_backend.cpp - // For now, fall back to ONNX - LOGI("LiteRT backend not yet available, using ONNX"); - actual = Backend::ONNX; - } -#endif - - if (actual == Backend::LITERT) { - LOGI("LiteRT requested but not compiled in, using ONNX"); - actual = Backend::ONNX; - } - - LOGI("Inference backend: ONNX Runtime"); - return std::make_unique(); -} diff --git a/sdk/src/main/cpp/util/json.h b/sdk/src/main/cpp/util/json.h deleted file mode 100644 index 5e44bd9..0000000 --- a/sdk/src/main/cpp/util/json.h +++ /dev/null @@ -1,241 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -/// Minimal JSON parser for our specific model config files. -/// Handles flat objects with string/int values and one level of nesting. -namespace json { - -using Dict = std::unordered_map; - -inline std::string read_file(const std::string& path) { - std::ifstream f(path); - if (!f.is_open()) return ""; - std::ostringstream ss; - ss << f.rdbuf(); - return ss.str(); -} - -inline void skip_ws(const std::string& s, size_t& i) { - while (i < s.size() && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n' || s[i] == '\r')) i++; -} - -inline std::string parse_string(const std::string& s, size_t& i) { - if (i >= s.size() || s[i] != '"') return ""; - i++; // skip opening quote - std::string result; - while (i < s.size() && s[i] != '"') { - if (s[i] == '\\' && i + 1 < s.size()) { - i++; - switch (s[i]) { - case '"': result += '"'; break; - case '\\': result += '\\'; break; - case 'n': result += '\n'; break; - case 't': result += '\t'; break; - case 'u': { - // Parse \uXXXX → UTF-8 - if (i + 4 < s.size()) { - std::string hex = s.substr(i + 1, 4); - unsigned long cp = std::stoul(hex, nullptr, 16); - i += 4; - if (cp < 0x80) { - result += static_cast(cp); - } else if (cp < 0x800) { - result += static_cast(0xC0 | (cp >> 6)); - result += static_cast(0x80 | (cp & 0x3F)); - } else { - result += static_cast(0xE0 | (cp >> 12)); - result += 
static_cast(0x80 | ((cp >> 6) & 0x3F)); - result += static_cast(0x80 | (cp & 0x3F)); - } - } - break; - } - default: result += s[i]; break; - } - } else { - result += s[i]; - } - i++; - } - if (i < s.size()) i++; // skip closing quote - return result; -} - -inline std::string parse_value_raw(const std::string& s, size_t& i) { - skip_ws(s, i); - if (i >= s.size()) return ""; - - if (s[i] == '"') return parse_string(s, i); - - // Number, bool, null - std::string val; - while (i < s.size() && s[i] != ',' && s[i] != '}' && s[i] != ']' - && s[i] != ' ' && s[i] != '\n' && s[i] != '\r') { - val += s[i++]; - } - return val; -} - -/// Skip a JSON value (string, number, object, array) -inline void skip_value(const std::string& s, size_t& i) { - skip_ws(s, i); - if (i >= s.size()) return; - if (s[i] == '"') { parse_string(s, i); return; } - if (s[i] == '{') { - int depth = 1; i++; - while (i < s.size() && depth > 0) { - if (s[i] == '{') { depth++; i++; } - else if (s[i] == '}') { depth--; i++; } - else if (s[i] == '"') { parse_string(s, i); } - else { i++; } - } - return; - } - if (s[i] == '[') { - int depth = 1; i++; - while (i < s.size() && depth > 0) { - if (s[i] == '[') { depth++; i++; } - else if (s[i] == ']') { depth--; i++; } - else if (s[i] == '"') { parse_string(s, i); } - else { i++; } - } - return; - } - parse_value_raw(s, i); -} - -/// Parse {"key": "value", ...} → map -/// Works for string and integer values (ints stored as strings). 
-inline Dict parse_flat_object(const std::string& text) { - Dict result; - size_t i = 0; - skip_ws(text, i); - if (i >= text.size() || text[i] != '{') return result; - i++; - - while (i < text.size()) { - skip_ws(text, i); - if (text[i] == '}') break; - if (text[i] == ',') { i++; continue; } - - auto key = parse_string(text, i); - skip_ws(text, i); - if (i < text.size() && text[i] == ':') i++; - skip_ws(text, i); - - if (i < text.size() && text[i] == '{') { - // Nested object — skip it for flat parsing - skip_value(text, i); - } else { - auto val = parse_value_raw(text, i); - result[key] = val; - } - } - return result; -} - -/// Heteronym entry: either a simple string or POS-tagged map. -struct DictEntry { - std::string simple; - std::unordered_map pos_map; // empty if simple - bool is_heteronym() const { return !pos_map.empty(); } -}; - -/// Parse pronunciation dictionary: {"word": "phonemes", "word2": {"VERB": "p1", "DEFAULT": "p2"}} -inline std::unordered_map parse_dictionary(const std::string& text) { - std::unordered_map result; - size_t i = 0; - skip_ws(text, i); - if (i >= text.size() || text[i] != '{') return result; - i++; - - while (i < text.size()) { - skip_ws(text, i); - if (text[i] == '}') break; - if (text[i] == ',') { i++; continue; } - - auto key = parse_string(text, i); - skip_ws(text, i); - if (i < text.size() && text[i] == ':') i++; - skip_ws(text, i); - - DictEntry entry; - if (i < text.size() && text[i] == '"') { - entry.simple = parse_string(text, i); - } else if (i < text.size() && text[i] == '{') { - // Nested POS map - i++; // skip { - while (i < text.size()) { - skip_ws(text, i); - if (text[i] == '}') { i++; break; } - if (text[i] == ',') { i++; continue; } - auto pos = parse_string(text, i); - skip_ws(text, i); - if (i < text.size() && text[i] == ':') i++; - skip_ws(text, i); - if (i < text.size() && text[i] == 'n') { - // null value - skip_value(text, i); - } else { - auto pron = parse_value_raw(text, i); - entry.pos_map[pos] = pron; - } 
- } - } else { - skip_value(text, i); - } - result[key] = std::move(entry); - } - return result; -} - -/// Parse vocab_index.json: {"vocab": {"sym": id, ...}} or flat {"sym": id, ...} -inline std::unordered_map parse_vocab_index(const std::string& text) { - std::unordered_map result; - size_t i = 0; - skip_ws(text, i); - if (i >= text.size() || text[i] != '{') return result; - i++; - - // Check if nested under "vocab" key - size_t save = i; - skip_ws(text, i); - auto first_key = parse_string(text, i); - skip_ws(text, i); - if (i < text.size() && text[i] == ':') i++; - skip_ws(text, i); - - size_t obj_start; - if (first_key == "vocab" && i < text.size() && text[i] == '{') { - obj_start = i + 1; - } else { - // Flat format — restart - i = save; - obj_start = i; - } - - i = obj_start; - while (i < text.size()) { - skip_ws(text, i); - if (text[i] == '}') break; - if (text[i] == ',') { i++; continue; } - - auto sym = parse_string(text, i); - skip_ws(text, i); - if (i < text.size() && text[i] == ':') i++; - auto val = parse_value_raw(text, i); - - try { - result[sym] = std::stoi(val); - } catch (...) {} - } - return result; -} - -} // namespace json diff --git a/setup.sh b/setup.sh index d49af03..37f5f29 100755 --- a/setup.sh +++ b/setup.sh @@ -66,24 +66,6 @@ else echo "ONNX Runtime already installed" fi -# --- .gitignore --- - -cat > "${ROOT}/.gitignore" << 'GITIGNORE' -# Build -.gradle/ -build/ -*.iml -.idea/ -local.properties - -# ONNX Runtime (downloaded by setup.sh) -/ort/ - -# Native build artifacts -.cxx/ -.externalNativeBuild/ -GITIGNORE - echo "" echo "Done. Open the project in Android Studio or run:" echo " ./gradlew :app:assembleDebug" diff --git a/speech-core b/speech-core index 679869d..ba75579 160000 --- a/speech-core +++ b/speech-core @@ -1 +1 @@ -Subproject commit 679869d9e91ec159611a086e7e5825daa073e72e +Subproject commit ba755794e6aabf9b98580ce8e591c1abd5ee2387