From 962b042520b09546377fe2d208e108f467f45554 Mon Sep 17 00:00:00 2001
From: Ivan <ivan@Ivans-MacBook-Pro.local>
Date: Wed, 13 May 2026 17:00:25 +0200
Subject: [PATCH] Make speech-android Android-only by deleting code that moved
 to speech-core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

speech-core PRs #19 and #20 lifted all the model wrappers, audio utilities,
and Linux examples out of this repo. This PR finishes the migration by
deleting the now-duplicated source and slimming the native side to a single
~250-line JNI bridge.

Net change: 51 files, +717 / -7412.

Bumped:
- speech-core submodule pointer: 679869d → ba75579 (PR #19 + #20 merged)

Deleted (now in speech-core):
- sdk/src/main/cpp/audio/  — fft, mel, stft (live at speech_core::audio)
- sdk/src/main/cpp/util/   — json.h
- sdk/src/main/cpp/models/ — silero_vad, parakeet_stt, kokoro_tts +
  phonemizer + multilingual, deepfilter, onnx_engine, inference_engine,
  onnx_backend, soc_detect
- linux/                   — moved verbatim to speech-core/examples/linux/
                             (libspeech.so, demo, CLIs, integration test)

Rewrote:
- sdk/src/main/cpp/jni_bridge.cpp (388 → 269 lines) — the model wrappers
  in speech_core::* directly implement VADInterface / STTInterface /
  TTSInterface / EnhancerInterface, so the 100+ lines of C-vtable adapter
  boilerplate (vad_process_chunk, stt_transcribe, tts_synthesize, etc.)
  that wrapped each model class into sc_*_vtable_t structs are gone. The
  bridge now constructs speech_core::SileroVad / ParakeetStt / KokoroTts
  and hands references to speech_core::VoicePipeline.
- sdk/src/main/cpp/CMakeLists.txt — replaced the manual list of speech-core
  source files with add_subdirectory(${SPEECH_CORE_DIR}) using
  SPEECH_CORE_WITH_ONNX=ON. Link speech_android against speech_core_models.

Compatibility:
- Kotlin contract unchanged. NativeBridge.onEvent still receives the same
  int event-type values (0..11). The new speech_core::EventType enum has
  ResponseDone and ResponseAudioDelta swapped relative to the old C ABI
  (sc_event_t.type) — added to_kotlin_event() to map explicitly so the
  Kotlin side keeps working without any change.
- Public Kotlin API (SpeechPipeline, SpeechConfig, SpeechEvent) untouched.

Docs:
- README.md rewritten as Android-only (Linux/Yocto/QNN sections moved
  to a one-line cross-link pointing at speech-core/examples/linux).
- All 9 README translations updated to mirror the new structure
  (zh, ja, ko, es, de, fr, hi, pt, ru) with existing high-quality
  translations preserved where the underlying English text is unchanged.
- AGENTS.md rewritten — Android-only scope, points contributors at
  speech-core for any C++ / model / Linux changes.
- .gitignore drops the linux/tests/models/ and /ort-linux/ entries that
  are no longer relevant.
- setup.sh trimmed to just the Android ORT download + submodule init
  (it was previously rewriting the .gitignore on every invocation).

Verified locally:
- ./gradlew :sdk:externalNativeBuildDebug — BUILD SUCCESSFUL, 5.6 MB
  libspeech_android.so produced for arm64-v8a, links libonnxruntime.so
  and libc++_shared.so cleanly.
- ./gradlew :sdk:assembleDebug :sdk:test — BUILD SUCCESSFUL, 77 tasks.

Next: connectedAndroidTest needs to run on an emulator (downloads
1.2 GB of models on first run); will run that in CI rather than locally.
---
 .gitignore                                    |    6 +-
 AGENTS.md                                     |  140 +-
 README.md                                     |  139 +-
 README_de.md                                  |  139 +-
 README_es.md                                  |  169 +-
 README_fr.md                                  |  135 +-
 README_hi.md                                  |  135 +-
 README_ja.md                                  |  139 +-
 README_ko.md                                  |  139 +-
 README_pt.md                                  |  168 +-
 README_ru.md                                  |  139 +-
 README_zh.md                                  |  134 +-
 linux/CMakeLists.txt                          |  102 -
 linux/README.md                               |  151 --
 linux/demo/main.cpp                           |  135 --
 linux/include/speech.h                        |   67 -
 linux/setup_linux.sh                          |   60 -
 linux/src/speech.cpp                          |  259 ---
 linux/tests/download_models.sh                |   40 -
 linux/tests/test_pipeline.cpp                 |  284 ---
 linux/toolchain-aarch64.cmake                 |   11 -
 linux/tools/phonemize.cpp                     |   47 -
 linux/tools/synthesize.cpp                    |  110 -
 linux/tools/transcribe.cpp                    |  262 ---
 sdk/src/main/cpp/CMakeLists.txt               |   76 +-
 sdk/src/main/cpp/audio/fft.cpp                |   92 -
 sdk/src/main/cpp/audio/fft.h                  |   13 -
 sdk/src/main/cpp/audio/mel.cpp                |  163 --
 sdk/src/main/cpp/audio/mel.h                  |   20 -
 sdk/src/main/cpp/audio/stft.cpp               |   64 -
 sdk/src/main/cpp/audio/stft.h                 |   33 -
 sdk/src/main/cpp/jni_bridge.cpp               |  308 +--
 sdk/src/main/cpp/models/deepfilter.cpp        |  192 --
 sdk/src/main/cpp/models/deepfilter.h          |   58 -
 sdk/src/main/cpp/models/inference_engine.h    |   73 -
 .../main/cpp/models/kokoro_multilingual.cpp   | 1841 -----------------
 sdk/src/main/cpp/models/kokoro_multilingual.h |   51 -
 sdk/src/main/cpp/models/kokoro_phonemizer.cpp |  456 ----
 sdk/src/main/cpp/models/kokoro_phonemizer.h   |   85 -
 sdk/src/main/cpp/models/kokoro_tts.cpp        |  258 ---
 sdk/src/main/cpp/models/kokoro_tts.h          |   41 -
 sdk/src/main/cpp/models/onnx_backend.h        |  131 --
 sdk/src/main/cpp/models/onnx_engine.h         |  122 --
 sdk/src/main/cpp/models/parakeet_stt.cpp      |  412 ----
 sdk/src/main/cpp/models/parakeet_stt.h        |   78 -
 sdk/src/main/cpp/models/silero_vad.cpp        |   74 -
 sdk/src/main/cpp/models/silero_vad.h          |   29 -
 sdk/src/main/cpp/models/soc_detect.cpp        |   88 -
 sdk/src/main/cpp/util/json.h                  |  241 ---
 setup.sh                                      |   18 -
 speech-core                                   |    2 +-
 51 files changed, 717 insertions(+), 7412 deletions(-)
 delete mode 100644 linux/CMakeLists.txt
 delete mode 100644 linux/README.md
 delete mode 100644 linux/demo/main.cpp
 delete mode 100644 linux/include/speech.h
 delete mode 100755 linux/setup_linux.sh
 delete mode 100644 linux/src/speech.cpp
 delete mode 100755 linux/tests/download_models.sh
 delete mode 100644 linux/tests/test_pipeline.cpp
 delete mode 100644 linux/toolchain-aarch64.cmake
 delete mode 100644 linux/tools/phonemize.cpp
 delete mode 100644 linux/tools/synthesize.cpp
 delete mode 100644 linux/tools/transcribe.cpp
 delete mode 100644 sdk/src/main/cpp/audio/fft.cpp
 delete mode 100644 sdk/src/main/cpp/audio/fft.h
 delete mode 100644 sdk/src/main/cpp/audio/mel.cpp
 delete mode 100644 sdk/src/main/cpp/audio/mel.h
 delete mode 100644 sdk/src/main/cpp/audio/stft.cpp
 delete mode 100644 sdk/src/main/cpp/audio/stft.h
 delete mode 100644 sdk/src/main/cpp/models/deepfilter.cpp
 delete mode 100644 sdk/src/main/cpp/models/deepfilter.h
 delete mode 100644 sdk/src/main/cpp/models/inference_engine.h
 delete mode 100644 sdk/src/main/cpp/models/kokoro_multilingual.cpp
 delete mode 100644 sdk/src/main/cpp/models/kokoro_multilingual.h
 delete mode 100644 sdk/src/main/cpp/models/kokoro_phonemizer.cpp
 delete mode 100644 sdk/src/main/cpp/models/kokoro_phonemizer.h
 delete mode 100644 sdk/src/main/cpp/models/kokoro_tts.cpp
 delete mode 100644 sdk/src/main/cpp/models/kokoro_tts.h
 delete mode 100644 sdk/src/main/cpp/models/onnx_backend.h
 delete mode 100644 sdk/src/main/cpp/models/onnx_engine.h
 delete mode 100644 sdk/src/main/cpp/models/parakeet_stt.cpp
 delete mode 100644 sdk/src/main/cpp/models/parakeet_stt.h
 delete mode 100644 sdk/src/main/cpp/models/silero_vad.cpp
 delete mode 100644 sdk/src/main/cpp/models/silero_vad.h
 delete mode 100644 sdk/src/main/cpp/models/soc_detect.cpp
 delete mode 100644 sdk/src/main/cpp/util/json.h

diff --git a/.gitignore b/.gitignore
index 25af2ea..f28a3e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,12 +5,8 @@ build/
 .idea/
 local.properties
 
-# ONNX Runtime (downloaded by setup.sh / setup_linux.sh)
+# ONNX Runtime (downloaded by setup.sh)
 /ort/
-/ort-linux/
-
-# Test models (downloaded by linux/tests/download_models.sh)
-linux/tests/models/
 
 # Native build artifacts
 .cxx/
diff --git a/AGENTS.md b/AGENTS.md
index 74e0840..20dcfae 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -2,17 +2,23 @@
 
 ## Project
 
-speech-android — on-device speech SDK for Android and embedded Linux (VAD + STT + TTS + noise cancellation).
+speech-android — on-device speech SDK for Android (VAD + STT + TTS + noise cancellation).
+
+Thin Kotlin SDK + JNI bridge over the [speech-core](https://github.com/soniqo/speech-core)
+C++ engine, which provides the orchestration pipeline AND the ONNX Runtime
+model wrappers (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3). This
+repo owns only the Android packaging and a single ~250-line JNI bridge.
+
+Linux/automotive support moved to [speech-core's `examples/linux/`](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Structure
 
-- `speech-core/` — C++17 git submodule, pipeline orchestration (do not modify directly)
-- `sdk/src/main/cpp/` — ONNX Runtime model implementations, JNI bridge, audio DSP
+- `speech-core/` — git submodule (do not modify directly; open PRs against soniqo/speech-core)
+- `sdk/src/main/cpp/` — `jni_bridge.cpp` + `CMakeLists.txt`. That's it. All model code lives in speech-core.
 - `sdk/src/main/kotlin/com/soniqo/speech/` — Kotlin public SDK
 - `sdk/src/androidTest/` — instrumented e2e tests
-- `linux/` — embedded Linux C API (automotive/Yocto)
 - `app/` — demo application
-- `setup.sh` — downloads ONNX Runtime, initializes submodule
+- `setup.sh` — downloads ONNX Runtime, initializes the speech-core submodule
 
 ## Build
 
@@ -24,68 +30,13 @@ speech-android — on-device speech SDK for Android and embedded Linux (VAD + ST
 
 ## Tests
 
-### Android (emulator or device)
-
-```bash
-./gradlew :sdk:connectedAndroidTest
-```
-
-Models download automatically via `ModelManager.ensureModels()`.
-23 tests across 5 suites: SileroVadTest, ParakeetSttTest, KokoroTtsTest, PipelineE2ETest, BargeInTest.
-
-### Linux
-
-```bash
-# 1. Download ONNX Runtime
-linux/setup_linux.sh
-
-# 2. Download test models
-linux/tests/download_models.sh
-
-# 3. Build
-cd linux && cmake -B build -DORT_DIR=../ort-linux && cmake --build build
-
-# 4. Run (set model dir)
-SPEECH_MODEL_DIR=tests/models ./build/speech_test
-```
-
-11 tests: config, lifecycle, speech detection, concurrency, null safety.
-
-## Models
-
-ONNX models hosted on HuggingFace under `aufklarer/` org. INT8 is default.
-Parakeet TDT v3 — multilingual STT (114 languages, 8192 BPE vocab).
-ModelManager.kt handles download and caching.
-
-## Key files
-
-- `jni_bridge.cpp` — wires ONNX models to speech-core C API via vtables
-- `SpeechPipeline.kt` — main public API
-- `parakeet_stt.cpp` — STT with TDT greedy decoder + per-feature mel normalization
-- `kokoro_tts.cpp` + `kokoro_phonemizer.cpp` — TTS with dictionary-based phonemizer
-- `silero_vad.cpp` — voice activity detection
-- `deepfilter.cpp` — noise cancellation with STFT/ERB processing
-- `onnx_engine.h` — platform-aware ONNX Runtime wrapper (Android NNAPI / Linux QNN)
-- `linux/src/speech.cpp` — Linux C API implementation
-- `linux/include/speech.h` — Linux public C header
-
-## Workflow
-
-- **Never push directly to main.** Create a feature branch, open a PR, and merge after review.
-- Branch naming: `feat/description`, `fix/description`, `chore/description`
-- PRs should include: summary, test plan, and link to related issues
-- Tag releases from main after PR is merged: `git tag v0.0.X && git push origin v0.0.X`
-- CI runs on tags: builds SDK, runs unit tests, publishes to Maven Central + GitHub Packages, creates GitHub Release with APK
-
-## Testing
-
 ### Unit tests (no device needed)
 
 ```bash
 ./gradlew :sdk:test
 ```
 
-15 tests: download retry, resume, timeout, validation, edge cases.
+Covers download retry, resume, timeout, validation, and edge cases.
 
 ### E2E tests (arm64 emulator or device)
 
@@ -93,7 +44,11 @@ ModelManager.kt handles download and caching.
 ./gradlew :sdk:connectedAndroidTest
 ```
 
-31 tests across 7 suites: SileroVadTest, ParakeetSttTest, KokoroTtsTest, KokoroMultilingualTest, PipelineE2ETest, BargeInTest, DeepFilterTest.
+Suites: `SileroVadTest`, `ParakeetSttTest`, `KokoroTtsTest`,
+`KokoroMultilingualTest`, `PipelineE2ETest`, `BargeInTest`, `DeepFilterTest`.
+
+Models (~1.2 GB) download on first run via `ModelManager.ensureModels()`.
+Subsequent runs use the device-side cache.
 
 #### Emulator setup (arm64, 4GB RAM required)
 
@@ -104,29 +59,50 @@ echo "no" | avdmanager create avd -n speech_test -k "system-images;android-35-ex
 /opt/homebrew/share/android-commandlinetools/emulator/emulator -avd speech_test -no-window -no-audio -no-boot-anim -gpu swiftshader_indirect -memory 4096
 ```
 
-Models (~1.2GB) download on first run. Subsequent runs use cache.
+## Models
+
+ONNX models are hosted on Hugging Face under the [`aufklarer/`](https://huggingface.co/aufklarer)
+org and are INT8-quantized by default.
 
-### Linux
+- `aufklarer/Silero-VAD-v5-ONNX` — VAD
+- `aufklarer/Parakeet-TDT-v3-ONNX` — STT (114 languages, 8192 BPE vocab)
+- `aufklarer/Kokoro-82M-ONNX` — TTS + phonemizer dicts + voice embeddings
+- `aufklarer/DeepFilterNet3-ONNX` — noise enhancer
 
-```bash
-linux/setup_linux.sh
-linux/tests/download_models.sh
-cd linux && cmake -B build -DORT_DIR=../ort-linux && cmake --build build
-SPEECH_MODEL_DIR=tests/models ./build/speech_test
-```
+`ModelManager.kt` handles download and caching. See speech-core's
+[`docs/models.md`](https://github.com/soniqo/speech-core/blob/main/docs/models.md)
+for the full model-file inventory.
+
+## Key files
+
+- `sdk/src/main/cpp/jni_bridge.cpp` — constructs `speech_core::SileroVad`/`ParakeetStt`/`KokoroTts` and feeds them to `speech_core::VoicePipeline`. No vtable adapters — the model wrappers implement the interfaces directly.
+- `sdk/src/main/cpp/CMakeLists.txt` — pulls speech-core in via `add_subdirectory` with `SPEECH_CORE_WITH_ONNX=ON`; the speech_core_models target provides every model wrapper.
+- `sdk/src/main/kotlin/com/soniqo/speech/SpeechPipeline.kt` — main public Kotlin API.
+- `sdk/src/main/kotlin/com/soniqo/speech/NativeBridge.kt` — JNI surface (must stay in lockstep with `jni_bridge.cpp`).
+- `sdk/src/main/kotlin/com/soniqo/speech/ModelManager.kt` — model download + caching.
+
+Native code that used to live here (`models/*.{cpp,h}`, `audio/{fft,mel,stft}.{cpp,h}`,
+`util/json.h`, `onnx_engine.h`) is now under speech-core. Modify it via a
+speech-core PR, then bump the submodule pointer here.
+
+## Workflow
 
-11 tests: config, lifecycle, speech detection, concurrency, null safety.
+- **Never push directly to main.** Create a feature branch, open a PR, merge after review.
+- Branch naming: `feat/description`, `fix/description`, `chore/description`.
+- PRs should include: summary, test plan, and link to related issues.
+- Tag releases from main after merge: `git tag v0.0.X && git push origin v0.0.X`.
+- CI runs on tags: builds SDK, runs unit tests, publishes to Maven Central + GitHub Packages, creates GitHub Release with APK.
 
 ## Guidelines
 
-- Keep native code in C++17, no external deps beyond ONNX Runtime, OkHttp, and speech-core
-- Kotlin SDK should be minimal — thin wrapper over JNI
-- All model tensor names/shapes must match actual ONNX exports
-- Test on arm64-v8a (Snapdragon) as primary target
-- No Claude attribution in commits, PRs, or model cards
-- **Never push directly to main — always use a PR**
-- **Always ask for confirmation before creating a git commit**
-- **Always ask for confirmation before any action visible to others** — pushing to any branch, opening / commenting on / reviewing / closing / merging PRs or issues, posting to Slack or any external service. The git commit rule above is one instance of this broader principle: never create externally visible artifacts without explicit confirmation.
-- **Run unit tests (`./gradlew :sdk:test`) after making code changes**
-- **Run e2e tests (`./gradlew :sdk:connectedAndroidTest`) before tagging a release**
-- **README translations must stay in sync.** Any change to `README.md` must be mirrored in all translated copies: `README_zh.md`, `README_ja.md`, `README_ko.md`, `README_es.md`, `README_de.md`, `README_fr.md`, `README_hi.md`, `README_pt.md`, `README_ru.md`
+- Keep native code in C++17. No external deps beyond ONNX Runtime, OkHttp, and speech-core.
+- Kotlin SDK stays minimal — thin wrapper over JNI.
+- All model tensor names/shapes must match the published ONNX exports under `aufklarer/`.
+- Test on arm64-v8a (Snapdragon) as primary target.
+- **No Claude attribution** in commits, PRs, or model cards. Strip both the `🤖 Generated with [Claude Code]` footer and the `Co-Authored-By: Claude …` trailer from defaults.
+- **Never push directly to main — always use a PR**.
+- **Always ask for confirmation before creating a git commit**.
+- **Always ask for confirmation before any externally visible action** — pushing to any branch, opening / commenting on / reviewing / closing / merging PRs or issues, posting to Slack or any external service. The git commit rule above is one instance of this broader principle.
+- **Run unit tests (`./gradlew :sdk:test`) after making code changes**.
+- **Run e2e tests (`./gradlew :sdk:connectedAndroidTest`) before tagging a release**.
+- **README translations must stay in sync.** Any change to `README.md` must be mirrored in all translated copies: `README_zh.md`, `README_ja.md`, `README_ko.md`, `README_es.md`, `README_de.md`, `README_fr.md`, `README_hi.md`, `README_pt.md`, `README_ru.md`.
diff --git a/README.md b/README.md
index 91dba12..95ae7ea 100644
--- a/README.md
+++ b/README.md
@@ -2,18 +2,15 @@
 
 📖 Read in: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-On-device speech SDK for Android and embedded Linux, powered by [ONNX Runtime](https://onnxruntime.ai) and [speech-core](https://github.com/soniqo/speech-core).
+On-device speech SDK for Android, powered by [ONNX Runtime](https://onnxruntime.ai) and [speech-core](https://github.com/soniqo/speech-core).
 
 Speech recognition (114 languages), text-to-speech (8 languages), voice activity detection, and noise cancellation — all running locally. No cloud APIs, no data leaves the device.
 
-**[Demo APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Models](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple counterpart) · **[speech-core](https://github.com/soniqo/speech-core)** (pipeline engine)
+**[Demo APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Models](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple counterpart) · **[speech-core](https://github.com/soniqo/speech-core)** (pipeline engine + Linux/embedded build)
 
-## Platforms
+## Scope
 
-| Platform | API | Acceleration | Directory |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` |
-| Embedded Linux | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` |
+This repo is the **Android packaging**: Kotlin SDK, JNI bridge, demo app. The C++ engine and ONNX model wrappers (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) live in [speech-core](https://github.com/soniqo/speech-core) and are pulled in via a git submodule. Linux / automotive (Yocto, Qualcomm SA8295P/SA8255P) lives at [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Models
 
@@ -24,15 +21,13 @@ Speech recognition (114 languages), text-to-speech (8 languages), voice activity
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Voice activity detection | 2 MB | Any |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Noise cancellation | ~8 MB | Any |
 
-Models are downloaded automatically on first launch (Android) or placed manually (Linux).
-
-## Android
+Models are downloaded automatically on first launch via `ModelManager.ensureModels()`.
 
-### Try the demo
+## Try the demo
 
 Download the [signed APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) and install on any arm64 Android device (8+). Models (~1.2 GB) download automatically on first launch.
 
-### Add dependency
+## Add dependency
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Kotlin usage
+## Kotlin usage
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### Build from source
+## Build from source
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 e2e tests
 ```
 
-### Demo app
+`./setup.sh` initializes the speech-core submodule and downloads ONNX Runtime
+into `./ort/`.
+
+## Demo app
 
 The [`app/`](app/) module is a minimal voice assistant demo with:
 
@@ -87,7 +85,7 @@ The [`app/`](app/) module is a minimal voice assistant demo with:
 ./gradlew :app:installDebug
 ```
 
-### System voice input (`RecognitionService`)
+## System voice input (`RecognitionService`)
 
 The SDK ships a ready-made `audio.soniqo.speech.service.SpeechRecognitionService`
 that plugs into Android's framework `SpeechRecognizer` API — no code to write.
@@ -159,53 +157,6 @@ Measured on Android emulator (arm64-v8a, no NNAPI). Real hardware is significant
 | Kokoro 82M | TTS | 1.9s output | 1,075ms | 0.58 |
 | Silero VAD v5 | VAD | 32ms chunk | <1ms | <0.01 |
 
-## Embedded Linux
-
-Minimal C API for automotive and embedded platforms. See [`linux/README.md`](linux/README.md) for full documentation.
-
-### C API usage
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Hexagon DSP acceleration
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### Build
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### Test
-
-```bash
-linux/tests/download_models.sh              # download ONNX models
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 tests
-```
-
-### Cross-compile for Yocto
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## Pipeline
 
 ```text
@@ -220,41 +171,51 @@ Barge-in supported: speaking during TTS playback interrupts and starts a new tra
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 lines)            │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (git submodule)  │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (orchestration:        │    │
+│  │   pipeline · turn · interruptions)   │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+Each model class directly implements the corresponding speech-core interface
+(`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) — the
+JNI bridge instantiates them and hands references to `VoicePipeline`. No
+C-vtable adapter boilerplate.
+
 ## Hardware Acceleration
 
-| Platform | Chipset | Acceleration |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| Automotive | SA8295P / SA8255P | QNN → Hexagon DSP |
-| Any | CPU fallback | XNNPACK |
+| Chipset | Acceleration |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| CPU fallback | XNNPACK |
+
+For automotive Qualcomm SA8295P / SA8255P with QNN (Hexagon DSP), see
+[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Related
 
-| Repository | Platform |
+| Repository | Scope |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | Cross-platform C++ pipeline engine |
-| **speech-android** | Android + embedded Linux — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | Cross-platform C++ pipeline engine + ONNX model wrappers + Linux/embedded examples |
+| **speech-android** | Android wrapper — Kotlin SDK + JNI bridge over speech-core |
 
 ## License
 
diff --git a/README_de.md b/README_de.md
index 8bdffd3..b2147fa 100644
--- a/README_de.md
+++ b/README_de.md
@@ -2,18 +2,15 @@
 
 📖 Sprachen: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-On-Device Speech-SDK für Android und Embedded Linux, basierend auf [ONNX Runtime](https://onnxruntime.ai) und [speech-core](https://github.com/soniqo/speech-core).
+On-Device Speech-SDK für Android, basierend auf [ONNX Runtime](https://onnxruntime.ai) und [speech-core](https://github.com/soniqo/speech-core).
 
 Spracherkennung (114 Sprachen), Text-to-Speech (8 Sprachen), Sprachaktivitätserkennung und Rauschunterdrückung — alles lokal ausgeführt. Keine Cloud-APIs, keine Daten verlassen das Gerät.
 
-**[Demo-APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelle](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple-Pendant) · **[speech-core](https://github.com/soniqo/speech-core)** (Pipeline-Engine)
+**[Demo-APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelle](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple-Pendant) · **[speech-core](https://github.com/soniqo/speech-core)** (Pipeline-Engine + Linux/Embedded-Build)
 
-## Plattformen
+## Geltungsbereich
 
-| Plattform | API | Beschleunigung | Verzeichnis |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` |
-| Embedded Linux | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` |
+Dieses Repo ist das **Android-Packaging**: Kotlin-SDK, JNI-Bridge, Demo-App. Die C++-Engine und die ONNX-Modell-Wrapper (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) liegen in [speech-core](https://github.com/soniqo/speech-core) und werden über ein Git-Submodul eingebunden. Linux / Automotive (Yocto, Qualcomm SA8295P/SA8255P) befindet sich unter [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Modelle
 
@@ -24,15 +21,13 @@ Spracherkennung (114 Sprachen), Text-to-Speech (8 Sprachen), Sprachaktivitätser
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Sprachaktivitätserkennung | 2 MB | Beliebig |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Rauschunterdrückung | ~8 MB | Beliebig |
 
-Modelle werden beim ersten Start automatisch heruntergeladen (Android) oder manuell platziert (Linux).
-
-## Android
+Modelle werden beim ersten Start automatisch über `ModelManager.ensureModels()` heruntergeladen.
 
-### Demo ausprobieren
+## Demo ausprobieren
 
 Lade das [signierte APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) herunter und installiere es auf einem beliebigen arm64-Android-Gerät (8+). Modelle (~1,2 GB) werden beim ersten Start automatisch heruntergeladen.
 
-### Abhängigkeit hinzufügen
+## Abhängigkeit hinzufügen
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Kotlin-Verwendung
+## Kotlin-Verwendung
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### Aus dem Quellcode bauen
+## Aus dem Quellcode bauen
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 e2e-Tests
 ```
 
-### Demo-App
+`./setup.sh` initialisiert das speech-core-Submodul und lädt die ONNX Runtime
+nach `./ort/` herunter.
+
+## Demo-App
 
 Das Modul [`app/`](app/) ist eine minimale Sprachassistenten-Demo mit:
 
@@ -87,7 +85,7 @@ Das Modul [`app/`](app/) ist eine minimale Sprachassistenten-Demo mit:
 ./gradlew :app:installDebug
 ```
 
-### Systemweite Spracheingabe (`RecognitionService`)
+## Systemweite Spracheingabe (`RecognitionService`)
 
 Das SDK enthält einen einsatzbereiten `audio.soniqo.speech.service.SpeechRecognitionService`, der sich in die `SpeechRecognizer`-API des Android-Frameworks einklinkt — kein Code zu schreiben. Sobald deine App als Standard-Spracherkennung ausgewählt ist, erhält jede Drittanbieter-App, die `SpeechRecognizer.createSpeechRecognizer(context)` (ohne `ComponentName`) aufruft, vollständiges On-Device-STT über deine Pipeline.
 
@@ -143,53 +141,6 @@ Gemessen auf einem Android-Emulator (arm64-v8a, ohne NNAPI). Echte Hardware ist
 | Kokoro 82M | TTS | 1,9s Ausgabe | 1.075ms | 0,58 |
 | Silero VAD v5 | VAD | 32ms-Block | <1ms | <0,01 |
 
-## Embedded Linux
-
-Minimale C-API für Automotive- und Embedded-Plattformen. Vollständige Dokumentation siehe [`linux/README.md`](linux/README.md).
-
-### C-API-Verwendung
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Hexagon-DSP-Beschleunigung
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### Bauen
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### Testen
-
-```bash
-linux/tests/download_models.sh              # ONNX-Modelle herunterladen
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 Tests
-```
-
-### Cross-Compile für Yocto
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## Pipeline
 
 ```text
@@ -204,41 +155,51 @@ Barge-In wird unterstützt: Sprechen während der TTS-Wiedergabe unterbricht und
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 Zeilen)           │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (Git-Submodul)   │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (Orchestrierung:       │    │
+│  │   Pipeline · Turn · Interruptions)   │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+Jede Modellklasse implementiert direkt die entsprechende speech-core-Schnittstelle
+(`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) — die
+JNI-Bridge instanziiert sie und übergibt Referenzen an `VoicePipeline`. Kein
+C-vtable-Adapter-Boilerplate.
+
 ## Hardwarebeschleunigung
 
-| Plattform | Chipsatz | Beschleunigung |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| Automotive | SA8295P / SA8255P | QNN → Hexagon DSP |
-| Beliebig | CPU-Fallback | XNNPACK |
+| Chipsatz | Beschleunigung |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| CPU-Fallback | XNNPACK |
+
+Für Automotive-Plattformen (Qualcomm SA8295P / SA8255P) mit QNN (Hexagon DSP) siehe
+[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Verwandte Projekte
 
-| Repository | Plattform |
+| Repository | Geltungsbereich |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | Plattformübergreifende C++-Pipeline-Engine |
-| **speech-android** | Android + Embedded Linux — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | Plattformübergreifende C++-Pipeline-Engine + ONNX-Modell-Wrapper + Linux/Embedded-Beispiele |
+| **speech-android** | Android-Wrapper — Kotlin-SDK + JNI-Bridge über speech-core |
 
 ## Lizenz
 
diff --git a/README_es.md b/README_es.md
index 575055c..c5ae3d3 100644
--- a/README_es.md
+++ b/README_es.md
@@ -2,18 +2,15 @@
 
 📖 Idiomas: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-SDK de voz en el dispositivo para Android y Linux embebido, impulsado por [ONNX Runtime](https://onnxruntime.ai) y [speech-core](https://github.com/soniqo/speech-core).
+SDK de voz en el dispositivo para Android, impulsado por [ONNX Runtime](https://onnxruntime.ai) y [speech-core](https://github.com/soniqo/speech-core).
 
 Reconocimiento de voz (114 idiomas), texto a voz (8 idiomas), detección de actividad de voz y cancelación de ruido — todo ejecutándose localmente. Sin APIs en la nube, ningún dato sale del dispositivo.
 
-**[APK de demostración](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelos](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (contraparte Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (motor de pipeline)
+**[APK de demostración](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelos](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (contraparte Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (motor de pipeline + compilación Linux/embebido)
 
-## Plataformas
+## Alcance
 
-| Plataforma | API | Aceleración | Directorio |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` |
-| Linux embebido | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` |
+Este repositorio es el **empaquetado para Android**: SDK de Kotlin, puente JNI, app demo. El motor C++ y los envoltorios de modelos ONNX (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) viven en [speech-core](https://github.com/soniqo/speech-core) y se incorporan vía un submódulo git. Linux / automoción (Yocto, Qualcomm SA8295P/SA8255P) vive en [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Modelos
 
@@ -24,15 +21,13 @@ Reconocimiento de voz (114 idiomas), texto a voz (8 idiomas), detección de acti
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Detección de actividad de voz | 2 MB | Cualquiera |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Cancelación de ruido | ~8 MB | Cualquiera |
 
-Los modelos se descargan automáticamente al primer inicio (Android) o se colocan manualmente (Linux).
-
-## Android
+Los modelos se descargan automáticamente al primer inicio vía `ModelManager.ensureModels()`.
 
-### Prueba la demo
+## Prueba la demo
 
 Descarga el [APK firmado](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) e instálalo en cualquier dispositivo Android arm64 (8+). Los modelos (~1.2 GB) se descargan automáticamente en el primer inicio.
 
-### Añadir dependencia
+## Añadir dependencia
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Uso de Kotlin
+## Uso de Kotlin
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### Compilar desde fuente
+## Compilar desde fuente
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 pruebas e2e
 ```
 
-### Aplicación demo
+`./setup.sh` inicializa el submódulo speech-core y descarga ONNX Runtime
+en `./ort/`.
+
+## Aplicación demo
 
 El módulo [`app/`](app/) es una demo mínima de asistente de voz con:
 
@@ -87,9 +85,14 @@ El módulo [`app/`](app/) es una demo mínima de asistente de voz con:
 ./gradlew :app:installDebug
 ```
 
-### Entrada de voz del sistema (`RecognitionService`)
+## Entrada de voz del sistema (`RecognitionService`)
 
-El SDK incluye un `audio.soniqo.speech.service.SpeechRecognitionService` listo para usar que se conecta a la API `SpeechRecognizer` del framework de Android — sin código que escribir. Una vez que tu app está seleccionada como reconocedor de voz predeterminado, cualquier app de terceros que llame a `SpeechRecognizer.createSpeechRecognizer(context)` (sin `ComponentName`) obtiene STT completamente en el dispositivo a través de tu pipeline.
+El SDK incluye un `audio.soniqo.speech.service.SpeechRecognitionService` listo
+para usar que se conecta a la API `SpeechRecognizer` del framework de Android
+— sin necesidad de escribir código. Una vez que tu app está seleccionada como
+reconocedor de voz predeterminado, cualquier app de terceros que llame a
+`SpeechRecognizer.createSpeechRecognizer(context)` (sin `ComponentName`)
+obtiene STT completamente en el dispositivo a través de tu pipeline.
 
 **1. Declara `RECORD_AUDIO` y el servicio en `AndroidManifest.xml`:**
 
@@ -118,20 +121,33 @@ El SDK incluye un `audio.soniqo.speech.service.SpeechRecognitionService` listo p
 <recognition-service xmlns:android="http://schemas.android.com/apk/res/android" />
 ```
 
-(Opcionalmente añade `android:settingsActivity="..."` para exponer un icono de engranaje en el selector de entrada de voz del sistema.)
+(Opcionalmente añade `android:settingsActivity="..."` para exponer un icono
+de engranaje en el selector de entrada de voz del sistema.)
 
-**3. Configura el servicio como predeterminado del sistema** (Ajustes → Sistema → Idiomas e introducción → Selector de entrada de voz en Android puro, o vía adb):
+**3. Configura el servicio como predeterminado del sistema** (Ajustes →
+Sistema → Idiomas e introducción → Selector de entrada de voz en Android
+puro, o vía adb):
 
 ```bash
 adb shell settings put secure voice_recognition_service \
   your.package/audio.soniqo.speech.service.SpeechRecognitionService
 ```
 
-**4. Verifica** ejecutando la pantalla *Recognizer test* de la app demo, que llama a `SpeechRecognizer.createSpeechRecognizer(ctx)` (sin componente) y registra cada callback del framework — útil para confirmar el round-trip del binder sin necesitar logcat.
+**4. Verifica** ejecutando la pantalla *Recognizer test* de la app demo, que
+llama a `SpeechRecognizer.createSpeechRecognizer(ctx)` (sin componente) y
+registra cada callback del framework — útil para confirmar el round-trip del
+binder sin necesitar logcat.
 
-El servicio implementa `onCheckRecognitionSupport` (API 33+) devolviendo los 27 idiomas BCP-47 que cubre Parakeet TDT v3, marcados como `installedOnDeviceLanguage` cuando los modelos están presentes (o `pendingOnDeviceLanguage` mientras se descargan). Se adquiere foco de audio con `AUDIOFOCUS_GAIN_TRANSIENT` durante la sesión.
+El servicio implementa `onCheckRecognitionSupport` (API 33+) devolviendo los
+27 idiomas BCP-47 que cubre Parakeet TDT v3, marcados como
+`installedOnDeviceLanguage` cuando los modelos están presentes (o
+`pendingOnDeviceLanguage` mientras se descargan). Se adquiere foco de audio
+con `AUDIOFOCUS_GAIN_TRANSIENT` durante la sesión.
 
-**Limitación:** Gboard, Samsung Keyboard y Google Assistant incluyen sus propios reconocedores y se saltan el predeterminado del sistema. Las apps que llaman explícitamente a la API `SpeechRecognizer` del framework (o construyen su propia UI sobre ella) son las que pasan por tu servicio.
+**Limitación:** Gboard, Samsung Keyboard y Google Assistant incluyen sus
+propios reconocedores y se saltan el predeterminado del sistema. Las apps
+que llaman explícitamente a la API `SpeechRecognizer` del framework (o
+construyen su propia UI sobre ella) son las que pasan por tu servicio.
 
 ## Rendimiento
 
@@ -143,53 +159,6 @@ Medido en emulador Android (arm64-v8a, sin NNAPI). El hardware real es significa
 | Kokoro 82M | TTS | 1.9s salida | 1,075ms | 0.58 |
 | Silero VAD v5 | VAD | bloque 32ms | <1ms | <0.01 |
 
-## Linux embebido
-
-API C mínima para plataformas automotrices y embebidas. Consulta [`linux/README.md`](linux/README.md) para la documentación completa.
-
-### Uso de la API C
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Aceleración Hexagon DSP
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### Compilar
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### Probar
-
-```bash
-linux/tests/download_models.sh              # descargar modelos ONNX
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 pruebas
-```
-
-### Compilación cruzada para Yocto
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## Pipeline
 
 ```text
@@ -204,41 +173,51 @@ Soporte de barge-in: hablar durante la reproducción TTS interrumpe e inicia una
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 líneas)           │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (submódulo git)  │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (orquestación:         │    │
+│  │   pipeline · turno · interrupciones) │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+Cada clase de modelo implementa directamente la interfaz correspondiente de
+speech-core (`VADInterface`, `STTInterface`, `TTSInterface`,
+`EnhancerInterface`) — el puente JNI las instancia y entrega las referencias
+a `VoicePipeline`. Sin código repetitivo de adaptadores con vtables en C.
+
 ## Aceleración por hardware
 
-| Plataforma | Chipset | Aceleración |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| Automoción | SA8295P / SA8255P | QNN → Hexagon DSP |
-| Cualquiera | Fallback CPU | XNNPACK |
+| Chipset | Aceleración |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| Fallback CPU | XNNPACK |
+
+Para las plataformas de automoción Qualcomm SA8295P / SA8255P con QNN (Hexagon DSP), consulta
+[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Relacionados
 
-| Repositorio | Plataforma |
+| Repositorio | Alcance |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | Motor de pipeline C++ multiplataforma |
-| **speech-android** | Android + Linux embebido — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | Motor de pipeline C++ multiplataforma + envoltorios de modelos ONNX + ejemplos Linux/embebido |
+| **speech-android** | Envoltorio Android — SDK Kotlin + puente JNI sobre speech-core |
 
 ## Licencia
 
diff --git a/README_fr.md b/README_fr.md
index 9752333..a46ab04 100644
--- a/README_fr.md
+++ b/README_fr.md
@@ -2,18 +2,15 @@
 
 📖 Langues : [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-SDK vocal sur appareil pour Android et Linux embarqué, propulsé par [ONNX Runtime](https://onnxruntime.ai) et [speech-core](https://github.com/soniqo/speech-core).
+SDK vocal sur appareil pour Android, propulsé par [ONNX Runtime](https://onnxruntime.ai) et [speech-core](https://github.com/soniqo/speech-core).
 
 Reconnaissance vocale (114 langues), synthèse vocale (8 langues), détection d'activité vocale et suppression de bruit — tout fonctionne en local. Aucune API cloud, aucune donnée ne quitte l'appareil.
 
-**[APK de démo](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modèles](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (équivalent Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (moteur de pipeline)
+**[APK de démo](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modèles](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (équivalent Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (moteur de pipeline + build Linux/embarqué)
 
-## Plateformes
+## Périmètre
 
-| Plateforme | API | Accélération | Répertoire |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` |
-| Linux embarqué | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` |
+Ce dépôt fournit le **packaging Android** : SDK Kotlin, pont JNI, application de démo. Le moteur C++ et les wrappers de modèles ONNX (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) résident dans [speech-core](https://github.com/soniqo/speech-core) et sont intégrés via un sous-module git. Le volet Linux / automobile (Yocto, Qualcomm SA8295P/SA8255P) se trouve dans [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Modèles
 
@@ -24,15 +21,13 @@ Reconnaissance vocale (114 langues), synthèse vocale (8 langues), détection d'
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Détection d'activité vocale | 2 Mo | Toutes |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Suppression de bruit | ~8 Mo | Toutes |
 
-Les modèles sont téléchargés automatiquement au premier lancement (Android) ou placés manuellement (Linux).
-
-## Android
+Les modèles sont téléchargés automatiquement au premier lancement via `ModelManager.ensureModels()`.
 
-### Essayer la démo
+## Essayer la démo
 
 Téléchargez l'[APK signé](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) et installez-le sur n'importe quel appareil Android arm64 (8+). Les modèles (~1,2 Go) sont téléchargés automatiquement au premier lancement.
 
-### Ajouter la dépendance
+## Ajouter la dépendance
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Utilisation Kotlin
+## Utilisation Kotlin
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### Compiler depuis les sources
+## Compiler depuis les sources
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 tests e2e
 ```
 
-### Application de démo
+`./setup.sh` initialise le sous-module speech-core et télécharge ONNX Runtime
+dans `./ort/`.
+
+## Application de démo
 
 Le module [`app/`](app/) est une démo minimale d'assistant vocal avec :
 
@@ -87,7 +85,7 @@ Le module [`app/`](app/) est une démo minimale d'assistant vocal avec :
 ./gradlew :app:installDebug
 ```
 
-### Entrée vocale système (`RecognitionService`)
+## Entrée vocale système (`RecognitionService`)
 
 Le SDK fournit un `audio.soniqo.speech.service.SpeechRecognitionService` prêt à l'emploi qui s'intègre à l'API `SpeechRecognizer` du framework Android — aucun code à écrire. Une fois votre app sélectionnée comme reconnaisseur vocal par défaut, toute application tierce appelant `SpeechRecognizer.createSpeechRecognizer(context)` (sans `ComponentName`) obtient un STT entièrement on-device via votre pipeline.
 
@@ -143,53 +141,6 @@ Mesuré sur émulateur Android (arm64-v8a, sans NNAPI). Le matériel réel est n
 | Kokoro 82M | TTS | 1,9 s en sortie | 1 075 ms | 0,58 |
 | Silero VAD v5 | VAD | bloc 32 ms | <1 ms | <0,01 |
 
-## Linux embarqué
-
-API C minimale pour les plateformes automobiles et embarquées. Voir [`linux/README.md`](linux/README.md) pour la documentation complète.
-
-### Utilisation de l'API C
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Accélération Hexagon DSP
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### Compiler
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### Tester
-
-```bash
-linux/tests/download_models.sh              # télécharger les modèles ONNX
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 tests
-```
-
-### Compilation croisée pour Yocto
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## Pipeline
 
 ```text
@@ -204,41 +155,47 @@ Le barge-in est pris en charge : parler pendant la lecture TTS l'interrompt et d
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 lignes)           │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (sous-module)    │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (orchestration :       │    │
+│  │   pipeline · tour · interruptions)   │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+Chaque classe de modèle implémente directement l'interface speech-core correspondante (`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) — le pont JNI les instancie et transmet les références à `VoicePipeline`. Aucun boilerplate d'adaptateur de vtable C.
+
 ## Accélération matérielle
 
-| Plateforme | Chipset | Accélération |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| Automobile | SA8295P / SA8255P | QNN → Hexagon DSP |
-| Toutes | Repli CPU | XNNPACK |
+| Chipset | Accélération |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| Repli CPU | XNNPACK |
+
+Pour les plateformes automobiles Qualcomm SA8295P / SA8255P avec QNN (Hexagon DSP), voir [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Projets liés
 
-| Dépôt | Plateforme |
+| Dépôt | Périmètre |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | Moteur de pipeline C++ multiplateforme |
-| **speech-android** | Android + Linux embarqué — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | Moteur de pipeline C++ multiplateforme + wrappers de modèles ONNX + exemples Linux/embarqué |
+| **speech-android** | Wrapper Android — SDK Kotlin + pont JNI sur speech-core |
 
 ## Licence
 
diff --git a/README_hi.md b/README_hi.md
index dde08c6..a1b91e3 100644
--- a/README_hi.md
+++ b/README_hi.md
@@ -2,18 +2,15 @@
 
 📖 भाषाएँ: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-Android और एम्बेडेड Linux के लिए ऑन-डिवाइस स्पीच SDK, [ONNX Runtime](https://onnxruntime.ai) और [speech-core](https://github.com/soniqo/speech-core) द्वारा संचालित।
+Android के लिए ऑन-डिवाइस स्पीच SDK, [ONNX Runtime](https://onnxruntime.ai) और [speech-core](https://github.com/soniqo/speech-core) द्वारा संचालित।
 
 स्पीच रिकग्निशन (114 भाषाएँ), टेक्स्ट-टू-स्पीच (8 भाषाएँ), वॉयस एक्टिविटी डिटेक्शन, और शोर रद्दीकरण — सभी स्थानीय रूप से चलते हैं। कोई क्लाउड API नहीं, कोई डेटा डिवाइस से बाहर नहीं जाता।
 
-**[डेमो APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[मॉडल](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple समकक्ष) · **[speech-core](https://github.com/soniqo/speech-core)** (पाइपलाइन इंजन)
+**[डेमो APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[मॉडल](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (Apple समकक्ष) · **[speech-core](https://github.com/soniqo/speech-core)** (पाइपलाइन इंजन + Linux/एम्बेडेड बिल्ड)
 
-## प्लेटफ़ॉर्म
+## स्कोप
 
-| प्लेटफ़ॉर्म | API | त्वरण | निर्देशिका |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` |
-| एम्बेडेड Linux | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` |
+यह रिपॉज़िटरी **Android पैकेजिंग** है: Kotlin SDK, JNI ब्रिज, डेमो ऐप। C++ इंजन और ONNX मॉडल रैपर (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) [speech-core](https://github.com/soniqo/speech-core) में रहते हैं और एक git सबमॉड्यूल के माध्यम से शामिल किए जाते हैं। Linux / ऑटोमोटिव (Yocto, Qualcomm SA8295P/SA8255P) [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux) पर रहता है।
 
 ## मॉडल
 
@@ -24,15 +21,13 @@ Android और एम्बेडेड Linux के लिए ऑन-डिव
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | वॉयस एक्टिविटी डिटेक्शन | 2 MB | कोई भी |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | शोर रद्दीकरण | ~8 MB | कोई भी |
 
-मॉडल पहले लॉन्च पर स्वचालित रूप से डाउनलोड होते हैं (Android) या मैन्युअल रूप से रखे जाते हैं (Linux)।
-
-## Android
+मॉडल पहले लॉन्च पर `ModelManager.ensureModels()` के माध्यम से स्वचालित रूप से डाउनलोड होते हैं।
 
-### डेमो आज़माएँ
+## डेमो आज़माएँ
 
 [हस्ताक्षरित APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) डाउनलोड करें और किसी भी arm64 Android डिवाइस (8+) पर इंस्टॉल करें। मॉडल (~1.2 GB) पहले लॉन्च पर स्वचालित रूप से डाउनलोड होते हैं।
 
-### निर्भरता जोड़ें
+## निर्भरता जोड़ें
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Kotlin उपयोग
+## Kotlin उपयोग
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### स्रोत से बिल्ड करें
+## स्रोत से बिल्ड करें
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 e2e परीक्षण
 ```
 
-### डेमो ऐप
+`./setup.sh` speech-core सबमॉड्यूल को इनिशियलाइज़ करता है और ONNX Runtime को
+`./ort/` में डाउनलोड करता है।
+
+## डेमो ऐप
 
 [`app/`](app/) मॉड्यूल एक न्यूनतम वॉयस असिस्टेंट डेमो है जिसमें शामिल हैं:
 
@@ -87,7 +85,7 @@ cd speech-android
 ./gradlew :app:installDebug
 ```
 
-### सिस्टम वॉयस इनपुट (`RecognitionService`)
+## सिस्टम वॉयस इनपुट (`RecognitionService`)
 
 SDK एक उपयोग के लिए तैयार `audio.soniqo.speech.service.SpeechRecognitionService` शामिल करता है जो Android फ्रेमवर्क के `SpeechRecognizer` API से जुड़ता है — कोई कोड लिखने की आवश्यकता नहीं। एक बार आपका ऐप डिफ़ॉल्ट वॉयस रिकग्नाइज़र के रूप में चुना जाता है, कोई भी थर्ड-पार्टी ऐप जो `SpeechRecognizer.createSpeechRecognizer(context)` (बिना `ComponentName` के) कॉल करता है, आपकी पाइपलाइन के माध्यम से पूरी तरह से ऑन-डिवाइस STT प्राप्त करता है।
 
@@ -143,53 +141,6 @@ Android एमुलेटर (arm64-v8a, NNAPI के बिना) पर म
 | Kokoro 82M | TTS | 1.9 सेकंड आउटपुट | 1,075 मिलीसेकंड | 0.58 |
 | Silero VAD v5 | VAD | 32 मिलीसेकंड चंक | <1 मिलीसेकंड | <0.01 |
 
-## एम्बेडेड Linux
-
-ऑटोमोटिव और एम्बेडेड प्लेटफ़ॉर्म के लिए न्यूनतम C API। पूर्ण दस्तावेज़ के लिए [`linux/README.md`](linux/README.md) देखें।
-
-### C API उपयोग
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Hexagon DSP त्वरण
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### बिल्ड
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### परीक्षण
-
-```bash
-linux/tests/download_models.sh              # ONNX मॉडल डाउनलोड करें
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 परीक्षण
-```
-
-### Yocto के लिए क्रॉस-कंपाइल
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## पाइपलाइन
 
 ```text
@@ -204,41 +155,47 @@ Idle → Listening → Transcribing → Speaking → Idle
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 lines)            │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (git submodule)  │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (orchestration:        │    │
+│  │   pipeline · turn · interruptions)   │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+प्रत्येक मॉडल क्लास सीधे संबंधित speech-core इंटरफ़ेस (`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) को लागू करता है — JNI ब्रिज उन्हें इंस्टैंशिएट करता है और संदर्भ `VoicePipeline` को सौंपता है। कोई C-vtable अडैप्टर बॉइलरप्लेट नहीं।
+
 ## हार्डवेयर त्वरण
 
-| प्लेटफ़ॉर्म | चिपसेट | त्वरण |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| ऑटोमोटिव | SA8295P / SA8255P | QNN → Hexagon DSP |
-| कोई भी | CPU फ़ॉलबैक | XNNPACK |
+| चिपसेट | त्वरण |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| CPU फ़ॉलबैक | XNNPACK |
+
+ऑटोमोटिव Qualcomm SA8295P / SA8255P पर QNN (Hexagon DSP) त्वरण के लिए [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux) देखें।
 
 ## संबंधित परियोजनाएँ
 
-| रिपॉज़िटरी | प्लेटफ़ॉर्म |
+| रिपॉज़िटरी | स्कोप |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | क्रॉस-प्लेटफ़ॉर्म C++ पाइपलाइन इंजन |
-| **speech-android** | Android + एम्बेडेड Linux — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | क्रॉस-प्लेटफ़ॉर्म C++ पाइपलाइन इंजन + ONNX मॉडल रैपर + Linux/एम्बेडेड उदाहरण |
+| **speech-android** | Android रैपर — speech-core के ऊपर Kotlin SDK + JNI ब्रिज |
 
 ## लाइसेंस
 
diff --git a/README_ja.md b/README_ja.md
index 76da318..abd78ea 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -2,18 +2,15 @@
 
 📖 言語: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-[ONNX Runtime](https://onnxruntime.ai) と [speech-core](https://github.com/soniqo/speech-core) を活用した、Android および組み込み Linux 向けのオンデバイス音声 SDK。
+[ONNX Runtime](https://onnxruntime.ai) と [speech-core](https://github.com/soniqo/speech-core) を活用した、Android 向けオンデバイス音声 SDK。
 
 音声認識(114 言語)、テキスト読み上げ(8 言語)、音声活動検出、ノイズキャンセリング — すべてローカルで動作。クラウド API 不要、データはデバイスから外に出ません。
 
-**[デモ APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[モデル](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 版)· **[speech-core](https://github.com/soniqo/speech-core)**(パイプラインエンジン)
+**[デモ APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[モデル](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 版)· **[speech-core](https://github.com/soniqo/speech-core)**(パイプラインエンジン + Linux/組み込みビルド)
 
-## プラットフォーム
+## スコープ
 
-| プラットフォーム | API | アクセラレーション | ディレクトリ |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI(Snapdragon、Exynos、Tensor) | `sdk/` |
-| 組み込み Linux | C (`speech.h`) | QNN(Hexagon DSP) | `linux/` |
+このリポジトリは **Android パッケージング** を担当します:Kotlin SDK、JNI ブリッジ、デモアプリ。C++ エンジンおよび ONNX モデルラッパー(Silero VAD、Parakeet STT、Kokoro TTS、DeepFilterNet3)は [speech-core](https://github.com/soniqo/speech-core) に存在し、git サブモジュールとして取り込まれます。Linux / 自動車向け(Yocto、Qualcomm SA8295P/SA8255P)は [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux) に存在します。
 
 ## モデル
 
@@ -24,15 +21,13 @@
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | 音声活動検出 | 2 MB | 任意 |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | ノイズキャンセリング | ~8 MB | 任意 |
 
-モデルは初回起動時に自動ダウンロード(Android)または手動配置(Linux)されます。
-
-## Android
+モデルは初回起動時に `ModelManager.ensureModels()` 経由で自動ダウンロードされます。
 
-### デモを試す
+## デモを試す
 
 [署名済み APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) をダウンロードし、任意の arm64 Android デバイス(8 以降)にインストールします。モデル(~1.2 GB)は初回起動時に自動ダウンロードされます。
 
-### 依存関係を追加
+## 依存関係を追加
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Kotlin の使い方
+## Kotlin の使い方
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### ソースからビルド
+## ソースからビルド
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 個の e2e テスト
 ```
 
-### デモアプリ
+`./setup.sh` は speech-core サブモジュールを初期化し、ONNX Runtime を
+`./ort/` にダウンロードします。
+
+## デモアプリ
 
 [`app/`](app/) モジュールは最小限の音声アシスタントデモで、以下を含みます:
 
@@ -87,7 +85,7 @@ cd speech-android
 ./gradlew :app:installDebug
 ```
 
-### システム音声入力(`RecognitionService`)
+## システム音声入力(`RecognitionService`)
 
 SDK には、Android フレームワークの `SpeechRecognizer` API に組み込めるすぐに使える `audio.soniqo.speech.service.SpeechRecognitionService` が含まれています — コードを書く必要はありません。アプリがデフォルトの音声認識サービスに選択されると、`SpeechRecognizer.createSpeechRecognizer(context)`(`ComponentName` なし)を呼び出す任意のサードパーティアプリが、あなたのパイプラインを通じて完全なオンデバイス STT を利用できます。
 
@@ -143,53 +141,6 @@ Android エミュレータ(arm64-v8a、NNAPI なし)で測定。実機ははる
 | Kokoro 82M | TTS | 1.9 秒出力 | 1,075 ミリ秒 | 0.58 |
 | Silero VAD v5 | VAD | 32 ミリ秒チャンク | <1 ミリ秒 | <0.01 |
 
-## 組み込み Linux
-
-自動車および組み込みプラットフォーム向けの最小限の C API。詳細は [`linux/README.md`](linux/README.md) を参照してください。
-
-### C API の使い方
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Hexagon DSP アクセラレーション
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### ビルド
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### テスト
-
-```bash
-linux/tests/download_models.sh              # ONNX モデルをダウンロード
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 個のテスト
-```
-
-### Yocto 向けクロスコンパイル
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## パイプライン
 
 ```text
@@ -204,41 +155,51 @@ Idle → Listening → Transcribing → Speaking → Idle
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 行)               │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (git サブモジュール) │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (オーケストレーション:    │    │
+│  │   パイプライン · ターン · 割り込み)     │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+各モデルクラスは対応する speech-core インターフェース
+(`VADInterface`、`STTInterface`、`TTSInterface`、`EnhancerInterface`)を
+直接実装します — JNI ブリッジがそれらをインスタンス化し、参照を
+`VoicePipeline` に渡します。C vtable アダプタの定型コードは不要です。
+
 ## ハードウェアアクセラレーション
 
-| プラットフォーム | チップセット | アクセラレーション |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| 自動車 | SA8295P / SA8255P | QNN → Hexagon DSP |
-| 任意 | CPU フォールバック | XNNPACK |
+| チップセット | アクセラレーション |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| CPU フォールバック | XNNPACK |
+
+自動車向け Qualcomm SA8295P / SA8255P と QNN(Hexagon DSP)については、
+[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux) を参照してください。
 
 ## 関連プロジェクト
 
-| リポジトリ | プラットフォーム |
+| リポジトリ | スコープ |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple(macOS、iOS)— MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | クロスプラットフォーム C++ パイプラインエンジン |
-| **speech-android** | Android + 組み込み Linux — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | クロスプラットフォーム C++ パイプラインエンジン + ONNX モデルラッパー + Linux/組み込み例 |
+| **speech-android** | Android ラッパー — speech-core 上の Kotlin SDK + JNI ブリッジ |
 
 ## ライセンス
 
diff --git a/README_ko.md b/README_ko.md
index 941a796..8c7eec4 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -2,18 +2,15 @@
 
 📖 언어: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-[ONNX Runtime](https://onnxruntime.ai)와 [speech-core](https://github.com/soniqo/speech-core) 기반의 Android 및 임베디드 Linux용 온디바이스 음성 SDK.
+[ONNX Runtime](https://onnxruntime.ai)와 [speech-core](https://github.com/soniqo/speech-core) 기반의 Android용 온디바이스 음성 SDK.
 
 음성 인식(114개 언어), 텍스트 음성 변환(8개 언어), 음성 활동 감지, 노이즈 캔슬링 — 모두 로컬에서 실행됩니다. 클라우드 API도, 디바이스 외부로 전송되는 데이터도 없습니다.
 
-**[데모 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[모델](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 버전) · **[speech-core](https://github.com/soniqo/speech-core)**(파이프라인 엔진)
+**[데모 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[모델](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 버전) · **[speech-core](https://github.com/soniqo/speech-core)**(파이프라인 엔진 + Linux/임베디드 빌드)
 
-## 플랫폼
+## 범위
 
-| 플랫폼 | API | 가속 | 디렉토리 |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI(Snapdragon, Exynos, Tensor) | `sdk/` |
-| 임베디드 Linux | C (`speech.h`) | QNN(Hexagon DSP) | `linux/` |
+이 저장소는 **Android 패키징**입니다: Kotlin SDK, JNI 브리지, 데모 앱. C++ 엔진과 ONNX 모델 래퍼(Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3)는 [speech-core](https://github.com/soniqo/speech-core)에 있으며 git 서브모듈을 통해 가져옵니다. Linux / 자동차(Yocto, Qualcomm SA8295P/SA8255P)는 [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux)에 있습니다.
 
 ## 모델
 
@@ -24,15 +21,13 @@
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | 음성 활동 감지 | 2 MB | 모든 언어 |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | 노이즈 캔슬링 | ~8 MB | 모든 언어 |
 
-모델은 첫 실행 시 자동 다운로드(Android)되거나 수동으로 배치(Linux)됩니다.
-
-## Android
+모델은 `ModelManager.ensureModels()`를 통해 첫 실행 시 자동으로 다운로드됩니다.
 
-### 데모 사용해보기
+## 데모 사용해보기
 
 [서명된 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)를 다운로드하여 arm64 Android 기기(8 이상)에 설치하세요. 모델(~1.2 GB)은 첫 실행 시 자동으로 다운로드됩니다.
 
-### 의존성 추가
+## 의존성 추가
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Kotlin 사용법
+## Kotlin 사용법
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### 소스에서 빌드
+## 소스에서 빌드
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34개 e2e 테스트
 ```
 
-### 데모 앱
+`./setup.sh`는 speech-core 서브모듈을 초기화하고 ONNX Runtime을
+`./ort/`로 다운로드합니다.
+
+## 데모 앱
 
 [`app/`](app/) 모듈은 최소한의 음성 비서 데모로 다음을 포함합니다:
 
@@ -87,7 +85,7 @@ cd speech-android
 ./gradlew :app:installDebug
 ```
 
-### 시스템 음성 입력(`RecognitionService`)
+## 시스템 음성 입력(`RecognitionService`)
 
 SDK는 Android 프레임워크 `SpeechRecognizer` API에 연결되는 바로 사용 가능한 `audio.soniqo.speech.service.SpeechRecognitionService`를 제공합니다 — 작성할 코드가 없습니다. 앱이 기본 음성 인식기로 선택되면, `SpeechRecognizer.createSpeechRecognizer(context)`(`ComponentName` 없이)를 호출하는 모든 타사 앱이 파이프라인을 통해 완전히 온디바이스 STT를 받을 수 있습니다.
 
@@ -143,53 +141,6 @@ Android 에뮬레이터(arm64-v8a, NNAPI 없음)에서 측정. 실제 하드웨
 | Kokoro 82M | TTS | 1.9초 출력 | 1,075ms | 0.58 |
 | Silero VAD v5 | VAD | 32ms 청크 | <1ms | <0.01 |
 
-## 임베디드 Linux
-
-자동차 및 임베디드 플랫폼을 위한 최소한의 C API. 전체 문서는 [`linux/README.md`](linux/README.md)를 참조하세요.
-
-### C API 사용법
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Hexagon DSP 가속
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### 빌드
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### 테스트
-
-```bash
-linux/tests/download_models.sh              # ONNX 모델 다운로드
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12개 테스트
-```
-
-### Yocto용 크로스 컴파일
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## 파이프라인
 
 ```text
@@ -204,41 +155,51 @@ Idle → Listening → Transcribing → Speaking → Idle
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 lines)            │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (git submodule)  │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (orchestration:        │    │
+│  │   pipeline · turn · interruptions)   │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+각 모델 클래스는 해당하는 speech-core 인터페이스(`VADInterface`,
+`STTInterface`, `TTSInterface`, `EnhancerInterface`)를 직접 구현합니다 —
+JNI 브리지가 이들을 인스턴스화하여 `VoicePipeline`에 참조를 전달합니다.
+C-vtable 어댑터 보일러플레이트가 없습니다.
+
 ## 하드웨어 가속
 
-| 플랫폼 | 칩셋 | 가속 |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| 자동차 | SA8295P / SA8255P | QNN → Hexagon DSP |
-| 모두 | CPU 폴백 | XNNPACK |
+| 칩셋 | 가속 |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| CPU 폴백 | XNNPACK |
+
+자동차용 Qualcomm SA8295P / SA8255P와 QNN(Hexagon DSP)은
+[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux)를 참조하세요.
 
 ## 관련 프로젝트
 
-| 저장소 | 플랫폼 |
+| 저장소 | 범위 |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple(macOS, iOS) — MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | 크로스 플랫폼 C++ 파이프라인 엔진 |
-| **speech-android** | Android + 임베디드 Linux — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | 크로스 플랫폼 C++ 파이프라인 엔진 + ONNX 모델 래퍼 + Linux/임베디드 예제 |
+| **speech-android** | Android 래퍼 — speech-core 위의 Kotlin SDK + JNI 브리지 |
 
 ## 라이선스
 
diff --git a/README_pt.md b/README_pt.md
index c149b52..17cace7 100644
--- a/README_pt.md
+++ b/README_pt.md
@@ -2,18 +2,15 @@
 
 📖 Idiomas: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-SDK de voz no dispositivo para Android e Linux embarcado, baseado em [ONNX Runtime](https://onnxruntime.ai) e [speech-core](https://github.com/soniqo/speech-core).
+SDK de voz no dispositivo para Android, baseado em [ONNX Runtime](https://onnxruntime.ai) e [speech-core](https://github.com/soniqo/speech-core).
 
 Reconhecimento de fala (114 idiomas), texto para fala (8 idiomas), detecção de atividade vocal e cancelamento de ruído — tudo executado localmente. Sem APIs em nuvem, nenhum dado sai do dispositivo.
 
-**[APK de demonstração](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelos](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (contraparte Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (motor de pipeline)
+**[APK de demonstração](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Modelos](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (contraparte Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (motor de pipeline + build Linux/embarcado)
 
-## Plataformas
+## Escopo
 
-| Plataforma | API | Aceleração | Diretório |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` |
-| Linux embarcado | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` |
+Este repositório é o **empacotamento Android**: SDK Kotlin, ponte JNI, app de demonstração. O motor C++ e os wrappers de modelo ONNX (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) ficam em [speech-core](https://github.com/soniqo/speech-core) e são incorporados via submódulo git. Linux / automotivo (Yocto, Qualcomm SA8295P/SA8255P) está em [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Modelos
 
@@ -24,15 +21,13 @@ Reconhecimento de fala (114 idiomas), texto para fala (8 idiomas), detecção de
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Detecção de atividade vocal | 2 MB | Qualquer |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Cancelamento de ruído | ~8 MB | Qualquer |
 
-Os modelos são baixados automaticamente no primeiro lançamento (Android) ou colocados manualmente (Linux).
-
-## Android
+Os modelos são baixados automaticamente no primeiro lançamento via `ModelManager.ensureModels()`.
 
-### Experimente a demo
+## Experimente a demo
 
 Baixe o [APK assinado](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) e instale em qualquer dispositivo Android arm64 (8+). Os modelos (~1,2 GB) são baixados automaticamente no primeiro lançamento.
 
-### Adicionar dependência
+## Adicionar dependência
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Uso do Kotlin
+## Uso do Kotlin
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### Compilar a partir do código-fonte
+## Compilar a partir do código-fonte
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 testes e2e
 ```
 
-### Aplicativo de demonstração
+`./setup.sh` inicializa o submódulo speech-core e baixa o ONNX Runtime
+para `./ort/`.
+
+## Aplicativo de demonstração
 
 O módulo [`app/`](app/) é uma demo mínima de assistente de voz com:
 
@@ -87,9 +85,14 @@ O módulo [`app/`](app/) é uma demo mínima de assistente de voz com:
 ./gradlew :app:installDebug
 ```
 
-### Entrada de voz do sistema (`RecognitionService`)
+## Entrada de voz do sistema (`RecognitionService`)
 
-O SDK fornece um `audio.soniqo.speech.service.SpeechRecognitionService` pronto para uso que se conecta à API `SpeechRecognizer` do framework do Android — sem código a escrever. Uma vez que seu app é selecionado como o reconhecedor de voz padrão, qualquer app de terceiros chamando `SpeechRecognizer.createSpeechRecognizer(context)` (sem `ComponentName`) obtém STT totalmente no dispositivo através do seu pipeline.
+O SDK fornece um `audio.soniqo.speech.service.SpeechRecognitionService` pronto
+para uso que se conecta à API `SpeechRecognizer` do framework do Android —
+sem código a escrever. Uma vez que seu app é selecionado como o reconhecedor
+de voz padrão, qualquer app de terceiros chamando
+`SpeechRecognizer.createSpeechRecognizer(context)` (sem `ComponentName`)
+obtém STT totalmente no dispositivo através do seu pipeline.
 
 **1. Declare `RECORD_AUDIO` e o serviço em `AndroidManifest.xml`:**
 
@@ -118,20 +121,32 @@ O SDK fornece um `audio.soniqo.speech.service.SpeechRecognitionService` pronto p
 <recognition-service xmlns:android="http://schemas.android.com/apk/res/android" />
 ```
 
-(Opcionalmente adicione `android:settingsActivity="..."` para expor um ícone de engrenagem no seletor de entrada de voz do sistema.)
+(Opcionalmente adicione `android:settingsActivity="..."` para expor um ícone
+de engrenagem no seletor de entrada de voz do sistema.)
 
-**3. Defina o serviço como padrão do sistema** (Configurações → Sistema → Idiomas e entrada → Seletor de entrada de voz no Android puro, ou via adb):
+**3. Defina o serviço como padrão do sistema** (Configurações → Sistema →
+Idiomas e entrada → Seletor de entrada de voz no Android puro, ou via adb):
 
 ```bash
 adb shell settings put secure voice_recognition_service \
   your.package/audio.soniqo.speech.service.SpeechRecognitionService
 ```
 
-**4. Verifique** executando a tela *Recognizer test* do app demo, que chama `SpeechRecognizer.createSpeechRecognizer(ctx)` (sem componente) e registra cada callback do framework — útil para confirmar o round-trip do binder sem precisar do logcat.
+**4. Verifique** executando a tela *Recognizer test* do app demo, que chama
+`SpeechRecognizer.createSpeechRecognizer(ctx)` (sem componente) e registra
+cada callback do framework — útil para confirmar o round-trip do binder sem
+precisar do logcat.
 
-O serviço implementa `onCheckRecognitionSupport` (API 33+) retornando os 27 idiomas BCP-47 cobertos pelo Parakeet TDT v3, marcados como `installedOnDeviceLanguage` quando os modelos estão presentes (ou `pendingOnDeviceLanguage` enquanto eles são baixados). O foco de áudio é adquirido com `AUDIOFOCUS_GAIN_TRANSIENT` pela duração de uma sessão.
+O serviço implementa `onCheckRecognitionSupport` (API 33+) retornando os
+27 idiomas BCP-47 cobertos pelo Parakeet TDT v3, marcados como
+`installedOnDeviceLanguage` quando os modelos estão presentes (ou
+`pendingOnDeviceLanguage` enquanto eles são baixados). O foco de áudio é
+adquirido com `AUDIOFOCUS_GAIN_TRANSIENT` pela duração de uma sessão.
 
-**Limitação:** Gboard, Samsung Keyboard e Google Assistant agrupam seus próprios reconhecedores e ignoram o padrão do sistema. Apps que chamam explicitamente a API `SpeechRecognizer` do framework (ou constroem sua própria UI em cima dela) são os que passam pelo seu serviço.
+**Limitação:** Gboard, Samsung Keyboard e Google Assistant agrupam seus
+próprios reconhecedores e ignoram o padrão do sistema. Apps que chamam
+explicitamente a API `SpeechRecognizer` do framework (ou constroem sua
+própria UI em cima dela) são os que passam pelo seu serviço.
 
 ## Desempenho
 
@@ -143,53 +158,6 @@ Medido em emulador Android (arm64-v8a, sem NNAPI). Hardware real é significativ
 | Kokoro 82M | TTS | 1,9s saída | 1.075ms | 0,58 |
 | Silero VAD v5 | VAD | bloco 32ms | <1ms | <0,01 |
 
-## Linux embarcado
-
-API C mínima para plataformas automotivas e embarcadas. Veja [`linux/README.md`](linux/README.md) para a documentação completa.
-
-### Uso da API C
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Aceleração Hexagon DSP
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### Compilar
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### Testar
-
-```bash
-linux/tests/download_models.sh              # baixar modelos ONNX
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 testes
-```
-
-### Compilação cruzada para Yocto
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## Pipeline
 
 ```text
@@ -204,41 +172,51 @@ Suporte a barge-in: falar durante a reprodução TTS interrompe e inicia uma nov
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 linhas)           │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (submódulo git)  │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (orquestração:         │    │
+│  │   pipeline · turno · interrupções)   │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+Cada classe de modelo implementa diretamente a interface correspondente de
+speech-core (`VADInterface`, `STTInterface`, `TTSInterface`,
+`EnhancerInterface`) — a ponte JNI as instancia e entrega referências ao
+`VoicePipeline`. Sem boilerplate de adaptador C-vtable.
+
 ## Aceleração de hardware
 
-| Plataforma | Chipset | Aceleração |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| Automotivo | SA8295P / SA8255P | QNN → Hexagon DSP |
-| Qualquer | Fallback CPU | XNNPACK |
+| Chipset | Aceleração |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| Fallback CPU | XNNPACK |
+
+Para Qualcomm SA8295P / SA8255P automotivo com QNN (Hexagon DSP), veja
+[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Projetos relacionados
 
-| Repositório | Plataforma |
+| Repositório | Escopo |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | Motor de pipeline C++ multiplataforma |
-| **speech-android** | Android + Linux embarcado — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | Motor de pipeline C++ multiplataforma + wrappers de modelo ONNX + exemplos Linux/embarcado |
+| **speech-android** | Wrapper Android — SDK Kotlin + ponte JNI sobre speech-core |
 
 ## Licença
 
diff --git a/README_ru.md b/README_ru.md
index 4b95abe..fc3a155 100644
--- a/README_ru.md
+++ b/README_ru.md
@@ -2,18 +2,15 @@
 
 📖 Языки: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-Речевой SDK для устройств Android и встраиваемого Linux, основанный на [ONNX Runtime](https://onnxruntime.ai) и [speech-core](https://github.com/soniqo/speech-core).
+Локальный речевой SDK для Android, основанный на [ONNX Runtime](https://onnxruntime.ai) и [speech-core](https://github.com/soniqo/speech-core).
 
 Распознавание речи (114 языков), синтез речи (8 языков), определение голосовой активности и шумоподавление — всё работает локально. Никаких облачных API, никакие данные не покидают устройство.
 
-**[Демо APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Модели](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (аналог для Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (движок конвейера)
+**[Демо APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[Модели](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)** (аналог для Apple) · **[speech-core](https://github.com/soniqo/speech-core)** (движок конвейера + сборка для Linux/встраиваемых систем)
 
-## Платформы
+## Область применения
 
-| Платформа | API | Ускорение | Каталог |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI (Snapdragon, Exynos, Tensor) | `sdk/` |
-| Встраиваемый Linux | C (`speech.h`) | QNN (Hexagon DSP) | `linux/` |
+Этот репозиторий — **Android-обёртка**: Kotlin SDK, JNI-мост, демо-приложение. C++-движок и обёртки ONNX-моделей (Silero VAD, Parakeet STT, Kokoro TTS, DeepFilterNet3) находятся в [speech-core](https://github.com/soniqo/speech-core) и подключаются через git-submodule. Linux / автомобильные системы (Yocto, Qualcomm SA8295P/SA8255P) — в [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Модели
 
@@ -24,15 +21,13 @@
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | Определение голосовой активности | 2 МБ | Любой |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | Шумоподавление | ~8 МБ | Любой |
 
-Модели загружаются автоматически при первом запуске (Android) или размещаются вручную (Linux).
-
-## Android
+Модели загружаются автоматически при первом запуске через `ModelManager.ensureModels()`.
 
-### Попробовать демо
+## Попробовать демо
 
 Скачайте [подписанный APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) и установите на любое arm64-устройство Android (8+). Модели (~1,2 ГБ) загружаются автоматически при первом запуске.
 
-### Добавить зависимость
+## Добавить зависимость
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Использование Kotlin
+## Использование Kotlin
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### Сборка из исходного кода
+## Сборка из исходного кода
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,10 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 e2e-теста
 ```
 
-### Демо-приложение
+`./setup.sh` инициализирует submodule speech-core и загружает ONNX Runtime
+в `./ort/`.
+
+## Демо-приложение
 
 Модуль [`app/`](app/) — минимальное демо голосового ассистента, включающее:
 
@@ -87,7 +85,7 @@ cd speech-android
 ./gradlew :app:installDebug
 ```
 
-### Системный голосовой ввод (`RecognitionService`)
+## Системный голосовой ввод (`RecognitionService`)
 
 SDK включает готовый к использованию `audio.soniqo.speech.service.SpeechRecognitionService`, который подключается к API `SpeechRecognizer` фреймворка Android — никакого кода писать не нужно. Как только ваше приложение выбрано в качестве распознавателя голоса по умолчанию, любое стороннее приложение, вызывающее `SpeechRecognizer.createSpeechRecognizer(context)` (без `ComponentName`), получает полностью локальный STT через ваш конвейер.
 
@@ -143,53 +141,6 @@ adb shell settings put secure voice_recognition_service \
 | Kokoro 82M | TTS | 1,9 с вывод | 1075 мс | 0,58 |
 | Silero VAD v5 | VAD | блок 32 мс | <1 мс | <0,01 |
 
-## Встраиваемый Linux
-
-Минимальный C API для автомобильных и встраиваемых платформ. Полную документацию см. в [`linux/README.md`](linux/README.md).
-
-### Использование C API
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Ускорение Hexagon DSP
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### Сборка
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### Тесты
-
-```bash
-linux/tests/download_models.sh              # загрузить модели ONNX
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 тестов
-```
-
-### Кросс-компиляция для Yocto
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## Конвейер
 
 ```text
@@ -204,41 +155,51 @@ Idle → Listening → Transcribing → Speaking → Idle
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 строк)            │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models (git submodule)  │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core  (оркестрация:          │    │
+│  │   pipeline · turn · прерывания)      │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+Каждый класс модели напрямую реализует соответствующий интерфейс speech-core
+(`VADInterface`, `STTInterface`, `TTSInterface`, `EnhancerInterface`) —
+JNI-мост создаёт их и передаёт ссылки в `VoicePipeline`. Никаких шаблонных
+обвязок через C-vtable.
+
 ## Аппаратное ускорение
 
-| Платформа | Чипсет | Ускорение |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| Автомобильная | SA8295P / SA8255P | QNN → Hexagon DSP |
-| Любая | Резерв CPU | XNNPACK |
+| Чипсет | Ускорение |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| Резерв CPU | XNNPACK |
+
+Для автомобильных платформ Qualcomm SA8295P / SA8255P с QNN (Hexagon DSP) см.
+[speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux).
 
 ## Связанные проекты
 
-| Репозиторий | Платформа |
+| Репозиторий | Область |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple (macOS, iOS) — MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | Кроссплатформенный движок конвейера на C++ |
-| **speech-android** | Android + встраиваемый Linux — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | Кроссплатформенный движок конвейера на C++ + обёртки ONNX-моделей + примеры для Linux/встраиваемых систем |
+| **speech-android** | Android-обёртка — Kotlin SDK + JNI-мост поверх speech-core |
 
 ## Лицензия
 
diff --git a/README_zh.md b/README_zh.md
index c92bc91..db1c562 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -2,18 +2,15 @@
 
 📖 阅读语言: [English](README.md) · [中文](README_zh.md) · [日本語](README_ja.md) · [한국어](README_ko.md) · [Español](README_es.md) · [Deutsch](README_de.md) · [Français](README_fr.md) · [हिन्दी](README_hi.md) · [Português](README_pt.md) · [Русский](README_ru.md)
 
-适用于 Android 和嵌入式 Linux 的设备端语音 SDK,基于 [ONNX Runtime](https://onnxruntime.ai) 和 [speech-core](https://github.com/soniqo/speech-core) 构建。
+适用于 Android 的设备端语音 SDK,基于 [ONNX Runtime](https://onnxruntime.ai) 和 [speech-core](https://github.com/soniqo/speech-core) 构建。
 
 语音识别(114 种语言)、文本转语音(8 种语言)、语音活动检测和噪声消除——全部在本地运行。无需云端 API,数据不会离开设备。
 
-**[演示 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[模型](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 对应版本)· **[speech-core](https://github.com/soniqo/speech-core)**(管线引擎)
+**[演示 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk)** · **[模型](https://huggingface.co/collections/aufklarer/speech-android-models-69bb8a156cac0b96a2247f26)** · **[speech-swift](https://github.com/soniqo/speech-swift)**(Apple 对应版本)· **[speech-core](https://github.com/soniqo/speech-core)**(管线引擎 + Linux/嵌入式构建)
 
-## 平台
+## 范围
 
-| 平台 | API | 加速 | 目录 |
-| --- | --- | --- | --- |
-| Android | Kotlin (`SpeechPipeline`) | NNAPI(Snapdragon、Exynos、Tensor) | `sdk/` |
-| 嵌入式 Linux | C (`speech.h`) | QNN(Hexagon DSP) | `linux/` |
+本仓库是 **Android 封装**:Kotlin SDK、JNI 桥接、演示应用。C++ 引擎和 ONNX 模型封装(Silero VAD、Parakeet STT、Kokoro TTS、DeepFilterNet3)位于 [speech-core](https://github.com/soniqo/speech-core),通过 git 子模块引入。Linux / 汽车平台(Yocto、Qualcomm SA8295P/SA8255P)的示例位于 [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux)。
 
 ## 模型
 
@@ -24,15 +21,13 @@
 | [Silero VAD v5](https://huggingface.co/aufklarer/Silero-VAD-v5-ONNX) | 语音活动检测 | 2 MB | 任意 |
 | [DeepFilterNet3](https://huggingface.co/aufklarer/DeepFilterNet3-ONNX) | 噪声消除 | ~8 MB | 任意 |
 
-模型在首次启动时自动下载(Android)或手动放置(Linux)。
-
-## Android
+模型在首次启动时通过 `ModelManager.ensureModels()` 自动下载。
 
-### 试用演示
+## 试用演示
 
 下载[已签名的 APK](https://github.com/soniqo/speech-android/releases/latest/download/app-release.apk) 并安装到任何 arm64 Android 设备(8 及以上)。模型(~1.2 GB)在首次启动时自动下载。
 
-### 添加依赖
+## 添加依赖
 
 ```kotlin
 dependencies {
@@ -40,7 +35,7 @@ dependencies {
 }
 ```
 
-### Kotlin 用法
+## Kotlin 用法
 
 ```kotlin
 val modelDir = ModelManager.ensureModels(context)
@@ -63,7 +58,7 @@ pipeline.start()
 pipeline.pushAudio(samples)
 ```
 
-### 从源代码构建
+## 从源代码构建
 
 ```bash
 git clone --recursive https://github.com/soniqo/speech-android.git
@@ -73,7 +68,9 @@ cd speech-android
 ./gradlew :sdk:connectedAndroidTest   # 34 个端到端测试
 ```
 
-### 演示应用
+`./setup.sh` 会初始化 speech-core 子模块并将 ONNX Runtime 下载到 `./ort/`。
+
+## 演示应用
 
 [`app/`](app/) 模块是一个最小化的语音助手演示,包含:
 
@@ -87,7 +84,7 @@ cd speech-android
 ./gradlew :app:installDebug
 ```
 
-### 系统语音输入(`RecognitionService`)
+## 系统语音输入(`RecognitionService`)
 
 SDK 自带可直接使用的 `audio.soniqo.speech.service.SpeechRecognitionService`,接入 Android 框架的 `SpeechRecognizer` API — 无需编写代码。一旦你的应用被设为默认语音识别器,任何调用 `SpeechRecognizer.createSpeechRecognizer(context)`(不指定 `ComponentName`)的第三方应用都能通过你的流水线获得完全本地的 STT。
 
@@ -143,53 +140,6 @@ adb shell settings put secure voice_recognition_service \
 | Kokoro 82M | TTS | 1.9 秒输出 | 1,075 毫秒 | 0.58 |
 | Silero VAD v5 | VAD | 32 毫秒块 | <1 毫秒 | <0.01 |
 
-## 嵌入式 Linux
-
-适用于汽车和嵌入式平台的最小化 C API。完整文档参见 [`linux/README.md`](linux/README.md)。
-
-### C API 用法
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("%s\n", event->text);
-}
-
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";
-cfg.use_qnn = true;  // Hexagon DSP 加速
-
-speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-speech_start(p);
-speech_push_audio(p, pcm_samples, 512);
-```
-
-### 构建
-
-```bash
-cd linux && ./setup_linux.sh
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-./build/speech_demo --model-dir /path/to/models
-```
-
-### 测试
-
-```bash
-linux/tests/download_models.sh              # 下载 ONNX 模型
-SPEECH_MODEL_DIR=tests/models ./build/speech_test   # 12 个测试
-```
-
-### 为 Yocto 交叉编译
-
-```bash
-source /opt/poky/environment-setup-aarch64-poky-linux
-cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake -DORT_DIR=...
-cmake --build build
-```
-
 ## 管线
 
 ```text
@@ -204,41 +154,47 @@ Idle → Listening → Transcribing → Speaking → Idle
 
 ```text
 ┌──────────────────────────────────────────────┐
-│   Android: SpeechPipeline (Kotlin/JNI)       │
-│   Linux:   speech.h (C API)                  │
-└──────────────────┬───────────────────────────┘
-                   │
-┌──────────────────┴───────────────────────────┐
-│            speech-core (C++ submodule)        │
-│   Turn detection · Interruptions · Context   │
-└──┬────────┬────────┬────────┬────────────────┘
-   │        │        │        │  vtables
-┌──┴──┐  ┌──┴──┐  ┌──┴──┐  ┌─┴────────┐
-│ VAD │  │ STT │  │ TTS │  │ Enhancer │
-│Silero│  │Para-│  │Koko-│  │DeepFilter│
-│     │  │keet │  │ro   │  │Net3      │
-└──┬──┘  └──┬──┘  └──┬──┘  └─┬────────┘
-   └────────┴────────┴────────┘
-       ONNX Runtime (CPU / NNAPI / QNN)
+│      SpeechPipeline (Kotlin)                 │
+│            │                                 │
+│            ▼                                 │
+│      jni_bridge.cpp  (~250 行)               │
+│            │                                 │
+│            ▼                                 │
+│  ┌──────────────────────────────────────┐    │
+│  │  speech_core_models(git 子模块)      │    │
+│  │   SileroVad / ParakeetStt /          │    │
+│  │   KokoroTts / DeepFilterEnhancer     │    │
+│  │            │                         │    │
+│  │            ▼                         │    │
+│  │  speech_core(编排:                  │    │
+│  │   管线 · 轮次 · 打断)               │    │
+│  └──────────────────────────────────────┘    │
+│            │                                 │
+│            ▼                                 │
+│      ONNX Runtime (CPU / NNAPI)              │
+└──────────────────────────────────────────────┘
 ```
 
+每个模型类直接实现对应的 speech-core 接口(`VADInterface`、`STTInterface`、`TTSInterface`、`EnhancerInterface`)—— JNI 桥接实例化它们并将引用交给 `VoicePipeline`。无需 C-vtable 适配器样板代码。
+
 ## 硬件加速
 
-| 平台 | 芯片组 | 加速 |
-| --- | --- | --- |
-| Android | Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
-| Android | Samsung Exynos 2200+ | NNAPI → Samsung NPU |
-| Android | Google Tensor G2+ | NNAPI → Google TPU |
-| 汽车 | SA8295P / SA8255P | QNN → Hexagon DSP |
-| 任意 | CPU 回退 | XNNPACK |
+| 芯片组 | 加速 |
+| --- | --- |
+| Snapdragon 8 Gen 1+ | NNAPI → Hexagon NPU |
+| Samsung Exynos 2200+ | NNAPI → Samsung NPU |
+| Google Tensor G2+ | NNAPI → Google TPU |
+| CPU 回退 | XNNPACK |
+
+如需在汽车平台 Qualcomm SA8295P / SA8255P 上使用 QNN(Hexagon DSP)加速,请参见 [speech-core/examples/linux](https://github.com/soniqo/speech-core/tree/main/examples/linux)。
 
 ## 相关项目
 
-| 仓库 | 平台 |
+| 仓库 | 范围 |
 | --- | --- |
 | [speech-swift](https://github.com/soniqo/speech-swift) | Apple(macOS、iOS)— MLX + CoreML |
-| [speech-core](https://github.com/soniqo/speech-core) | 跨平台 C++ 管线引擎 |
-| **speech-android** | Android + 嵌入式 Linux — ONNX Runtime |
+| [speech-core](https://github.com/soniqo/speech-core) | 跨平台 C++ 管线引擎 + ONNX 模型封装 + Linux/嵌入式示例 |
+| **speech-android** | Android 封装 — 基于 speech-core 的 Kotlin SDK + JNI 桥接 |
 
 ## 许可证
 
diff --git a/linux/CMakeLists.txt b/linux/CMakeLists.txt
deleted file mode 100644
index 667fccd..0000000
--- a/linux/CMakeLists.txt
+++ /dev/null
@@ -1,102 +0,0 @@
-cmake_minimum_required(VERSION 3.16)
-project(speech_linux VERSION 0.1.0 LANGUAGES CXX)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-option(SPEECH_BUILD_DEMO "Build ALSA demo CLI" ON)
-option(SPEECH_BUILD_TESTS "Build tests" ON)
-option(SPEECH_BUILD_TOOLS "Build CLI tools (transcribe)" ON)
-
-# --- Paths ---
-set(SDK_CPP "${CMAKE_CURRENT_SOURCE_DIR}/../sdk/src/main/cpp")
-set(SPEECH_CORE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../speech-core" CACHE PATH "speech-core directory")
-set(ORT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../ort-linux" CACHE PATH "ONNX Runtime directory")
-
-# --- speech-core (static library) ---
-file(GLOB_RECURSE SPEECH_CORE_SOURCES "${SPEECH_CORE_DIR}/src/*.cpp")
-add_library(speech_core STATIC ${SPEECH_CORE_SOURCES})
-target_include_directories(speech_core PUBLIC "${SPEECH_CORE_DIR}/include")
-target_compile_features(speech_core PUBLIC cxx_std_17)
-
-# --- ONNX Runtime ---
-add_library(onnxruntime SHARED IMPORTED)
-if(APPLE)
-    set(_ORT_LIB "${ORT_DIR}/lib/libonnxruntime.dylib")
-else()
-    set(_ORT_LIB "${ORT_DIR}/lib/libonnxruntime.so")
-endif()
-set_target_properties(onnxruntime PROPERTIES
-    IMPORTED_LOCATION "${_ORT_LIB}"
-    INTERFACE_INCLUDE_DIRECTORIES "${ORT_DIR}/include"
-)
-
-# --- libspeech.so ---
-add_library(speech SHARED
-    src/speech.cpp
-    ${SDK_CPP}/audio/mel.cpp
-    ${SDK_CPP}/audio/fft.cpp
-    ${SDK_CPP}/audio/stft.cpp
-    ${SDK_CPP}/models/silero_vad.cpp
-    ${SDK_CPP}/models/parakeet_stt.cpp
-    ${SDK_CPP}/models/kokoro_tts.cpp
-    ${SDK_CPP}/models/kokoro_phonemizer.cpp
-    ${SDK_CPP}/models/kokoro_multilingual.cpp
-    ${SDK_CPP}/models/deepfilter.cpp
-)
-
-target_include_directories(speech
-    PUBLIC
-        ${CMAKE_CURRENT_SOURCE_DIR}/include
-    PRIVATE
-        ${SDK_CPP}
-        ${SDK_CPP}/models
-        ${ORT_DIR}/include
-        ${SPEECH_CORE_DIR}/include
-)
-
-target_link_libraries(speech PRIVATE speech_core onnxruntime)
-
-# --- Demo CLI ---
-if(SPEECH_BUILD_DEMO)
-    add_executable(speech_demo demo/main.cpp)
-    target_link_libraries(speech_demo PRIVATE speech)
-    target_include_directories(speech_demo PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-
-    find_library(ALSA_LIB asound)
-    if(ALSA_LIB)
-        target_link_libraries(speech_demo PRIVATE ${ALSA_LIB})
-        target_compile_definitions(speech_demo PRIVATE HAS_ALSA=1)
-    endif()
-endif()
-
-# --- Tests ---
-if(SPEECH_BUILD_TESTS)
-    enable_testing()
-    add_executable(speech_test tests/test_pipeline.cpp)
-    target_link_libraries(speech_test PRIVATE speech)
-    target_include_directories(speech_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-    add_test(NAME pipeline_test COMMAND speech_test)
-endif()
-
-# --- CLI tools ---
-if(SPEECH_BUILD_TOOLS)
-    add_executable(speech_transcribe tools/transcribe.cpp)
-    target_link_libraries(speech_transcribe PRIVATE speech)
-    target_include_directories(speech_transcribe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-
-    # speech_synthesize calls KokoroTts directly — needs the SDK private headers.
-    add_executable(speech_synthesize tools/synthesize.cpp)
-    target_link_libraries(speech_synthesize PRIVATE speech onnxruntime)
-    target_include_directories(speech_synthesize PRIVATE
-        ${SDK_CPP}
-        ${SDK_CPP}/models
-        ${ORT_DIR}/include)
-
-    add_executable(speech_phonemize tools/phonemize.cpp)
-    target_link_libraries(speech_phonemize PRIVATE speech)
-    target_include_directories(speech_phonemize PRIVATE
-        ${SDK_CPP}
-        ${SDK_CPP}/models)
-endif()
diff --git a/linux/README.md b/linux/README.md
deleted file mode 100644
index 574bcdf..0000000
--- a/linux/README.md
+++ /dev/null
@@ -1,151 +0,0 @@
-# speech-linux
-
-On-device speech SDK for embedded Linux — VAD, STT (multilingual), TTS, noise cancellation.
-
-Targets automotive (Qualcomm SA8295P, SA8255P) and embedded ARM64 platforms running Yocto or similar Linux distributions.
-
-## Quick Start
-
-```bash
-# Download ONNX Runtime
-./setup_linux.sh
-
-# Build
-cmake -B build -DORT_DIR=../ort-linux
-cmake --build build
-
-# Run tests
-cd build && ctest
-
-# Run demo (ALSA mic)
-./speech_demo --model-dir /path/to/models
-
-# Run demo (stdin PCM pipe)
-arecord -f FLOAT_LE -r 16000 -c 1 | ./speech_demo --model-dir /path/to/models
-```
-
-## C API
-
-```c
-#include <speech.h>
-
-void on_event(const speech_event_t* event, void* ctx) {
-    if (event->type == SPEECH_EVENT_TRANSCRIPTION)
-        printf("STT: %s\n", event->text);
-}
-
-int main() {
-    speech_config_t cfg = speech_config_default();
-    cfg.model_dir = "/opt/speech/models";
-
-    speech_pipeline_t p = speech_create(cfg, on_event, NULL);
-    speech_start(p);
-
-    // Feed 16kHz mono float32 PCM from your audio source
-    while (has_audio()) {
-        float buf[512];
-        read_audio(buf, 512);
-        speech_push_audio(p, buf, 512);
-    }
-
-    speech_destroy(p);
-}
-```
-
-### Functions
-
-| Function | Description |
-|---|---|
-| `speech_config_default()` | Default config (INT8, CPU, 400ms silence threshold) |
-| `speech_create(config, callback, ctx)` | Load models, create pipeline. Returns `NULL` on failure |
-| `speech_start(pipeline)` | Start processing audio |
-| `speech_push_audio(pipeline, samples, count)` | Feed PCM float32 at 16 kHz |
-| `speech_resume_listening(pipeline)` | Resume after TTS playback |
-| `speech_destroy(pipeline)` | Free all resources |
-| `speech_version()` | Version string |
-
-### Events
-
-| Event | Fields | Description |
-|---|---|---|
-| `SPEECH_EVENT_READY` | — | Pipeline initialized |
-| `SPEECH_EVENT_SPEECH_STARTED` | — | VAD detected speech |
-| `SPEECH_EVENT_SPEECH_ENDED` | — | VAD detected silence |
-| `SPEECH_EVENT_TRANSCRIPTION` | `text`, `confidence`, `stt_duration_ms` | Final transcription |
-| `SPEECH_EVENT_RESPONSE_AUDIO` | `audio_data`, `audio_data_length` | TTS PCM16 audio chunk (24 kHz) |
-| `SPEECH_EVENT_RESPONSE_DONE` | `tts_duration_ms` | TTS complete |
-| `SPEECH_EVENT_ERROR` | `text` | Error message |
-
-### Configuration
-
-```c
-speech_config_t cfg = speech_config_default();
-cfg.model_dir = "/opt/speech/models";  // required
-cfg.use_int8 = true;                   // INT8 quantized models (default)
-cfg.use_qnn = true;                    // Qualcomm QNN EP (Hexagon DSP)
-cfg.enable_enhancer = true;            // DeepFilterNet noise cancellation
-cfg.transcribe_only = true;            // STT only, no TTS echo
-cfg.min_silence_duration = 0.4f;       // seconds before end-of-speech
-```
-
-## Models
-
-Download from HuggingFace (`aufklarer/` org) into a single directory:
-
-```
-models/
-  silero-vad.onnx                    2 MB   Voice activity detection
-  parakeet-encoder-int8.onnx       840 MB   STT encoder (multilingual, 114 languages)
-  parakeet-decoder-joint-int8.onnx  51 MB   STT decoder
-  vocab.json                       156 KB   BPE vocabulary (8192 tokens)
-  kokoro-int8.onnx                 330 MB   TTS (English)
-  vocab_index.json                   2 KB   TTS phonemizer vocab
-  us_gold.json                       2 B    TTS phonemizer dict
-  us_silver.json                     2 B    TTS phonemizer dict
-  voices/af_heart.bin                1 KB   Voice embedding
-```
-
-## Cross-Compilation (Yocto)
-
-```bash
-# Source Yocto SDK environment
-source /opt/poky/environment-setup-aarch64-poky-linux
-
-# Build with cross-toolchain
-cmake -B build \
-    -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64.cmake \
-    -DORT_DIR=/path/to/ort-linux-aarch64
-
-cmake --build build
-```
-
-## QNN (Qualcomm Hexagon DSP)
-
-For hardware acceleration on SA8295P / SA8255P:
-
-1. Build ONNX Runtime with QNN EP or use Qualcomm's prebuilt
-2. Place `libQnnHtp.so` in the library path
-3. Set `cfg.use_qnn = true`
-
-The pipeline falls back to CPU if QNN is unavailable.
-
-## Architecture
-
-```
-libspeech.so
-  ├── speech.h (C API)
-  ├── speech-core (pipeline orchestration)
-  ├── Silero VAD v5 (voice activity detection)
-  ├── Parakeet TDT v3 (multilingual STT, 114 languages)
-  ├── Kokoro 82M (TTS)
-  ├── DeepFilterNet3 (noise cancellation)
-  └── ONNX Runtime (CPU / QNN EP)
-```
-
-All inference runs on-device. No network required after model download.
-
-## Thread Safety
-
-- `speech_push_audio()` is thread-safe (single producer)
-- Event callback fires from an internal worker thread
-- Do not call `speech_destroy()` from the event callback
diff --git a/linux/demo/main.cpp b/linux/demo/main.cpp
deleted file mode 100644
index 5ac0ffa..0000000
--- a/linux/demo/main.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-#include "speech.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <csignal>
-#include <cstring>
-#include <unistd.h>
-
-#ifdef HAS_ALSA
-#include <alsa/asoundlib.h>
-#endif
-
-static volatile bool running = true;
-
-static void signal_handler(int) { running = false; }
-
-static void on_event(const speech_event_t* event, void* /*ctx*/) {
-    switch (event->type) {
-        case SPEECH_EVENT_SPEECH_STARTED:
-            fprintf(stderr, "[VAD] speech started\n");
-            break;
-        case SPEECH_EVENT_SPEECH_ENDED:
-            fprintf(stderr, "[VAD] speech ended\n");
-            break;
-        case SPEECH_EVENT_TRANSCRIPTION:
-            printf("[STT] %s (%.0fms, conf=%.2f)\n",
-                   event->text ? event->text : "",
-                   event->stt_duration_ms, event->confidence);
-            fflush(stdout);
-            break;
-        case SPEECH_EVENT_RESPONSE_DONE:
-            fprintf(stderr, "[TTS] done (%.0fms)\n", event->tts_duration_ms);
-            break;
-        case SPEECH_EVENT_ERROR:
-            fprintf(stderr, "[ERROR] %s\n", event->text ? event->text : "unknown");
-            break;
-        default:
-            break;
-    }
-}
-
-static void print_usage(const char* prog) {
-    fprintf(stderr, "Usage: %s --model-dir <path> [--qnn] [--transcribe-only] [--device <alsa_dev>]\n", prog);
-}
-
-int main(int argc, char* argv[]) {
-    const char* model_dir = nullptr;
-    const char* alsa_device = "default";
-    bool use_qnn = false;
-    bool transcribe_only = false;
-
-    for (int i = 1; i < argc; i++) {
-        if (strcmp(argv[i], "--model-dir") == 0 && i + 1 < argc) {
-            model_dir = argv[++i];
-        } else if (strcmp(argv[i], "--qnn") == 0) {
-            use_qnn = true;
-        } else if (strcmp(argv[i], "--transcribe-only") == 0) {
-            transcribe_only = true;
-        } else if (strcmp(argv[i], "--device") == 0 && i + 1 < argc) {
-            alsa_device = argv[++i];
-        } else {
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    if (!model_dir) {
-        print_usage(argv[0]);
-        return 1;
-    }
-
-    fprintf(stderr, "speech-linux %s\n", speech_version());
-    fprintf(stderr, "Models: %s\n", model_dir);
-    fprintf(stderr, "QNN: %s\n", use_qnn ? "yes" : "no");
-
-    speech_config_t config = speech_config_default();
-    config.model_dir = model_dir;
-    config.use_qnn = use_qnn;
-    config.transcribe_only = transcribe_only;
-
-    fprintf(stderr, "Loading models...\n");
-    speech_pipeline_t pipeline = speech_create(config, on_event, nullptr);
-    if (!pipeline) {
-        fprintf(stderr, "Failed to create pipeline\n");
-        return 1;
-    }
-
-    signal(SIGINT, signal_handler);
-    signal(SIGTERM, signal_handler);
-
-    speech_start(pipeline);
-    fprintf(stderr, "Listening... (Ctrl+C to stop)\n");
-
-#ifdef HAS_ALSA
-    snd_pcm_t* capture = nullptr;
-    int err = snd_pcm_open(&capture, alsa_device, SND_PCM_STREAM_CAPTURE, 0);
-    if (err < 0) {
-        fprintf(stderr, "ALSA open failed: %s\n", snd_strerror(err));
-        speech_destroy(pipeline);
-        return 1;
-    }
-
-    snd_pcm_set_params(capture, SND_PCM_FORMAT_FLOAT_LE, SND_PCM_ACCESS_RW_INTERLEAVED,
-                        1, 16000, 1, 100000);
-
-    float buffer[512];
-    while (running) {
-        snd_pcm_sframes_t frames = snd_pcm_readi(capture, buffer, 512);
-        if (frames < 0) {
-            frames = snd_pcm_recover(capture, (int)frames, 0);
-            if (frames < 0) break;
-        }
-        if (frames > 0) {
-            speech_push_audio(pipeline, buffer, (size_t)frames);
-        }
-    }
-
-    snd_pcm_close(capture);
-#else
-    // No ALSA: read raw float32 PCM from stdin
-    fprintf(stderr, "No ALSA — reading float32 PCM from stdin (16kHz mono)\n");
-    float buffer[512];
-    while (running) {
-        size_t n = fread(buffer, sizeof(float), 512, stdin);
-        if (n == 0) break;
-        speech_push_audio(pipeline, buffer, n);
-        // Simulate real-time pace
-        usleep((unsigned int)(n * 1000000 / 16000));
-    }
-#endif
-
-    fprintf(stderr, "\nShutting down...\n");
-    speech_destroy(pipeline);
-    return 0;
-}
diff --git a/linux/include/speech.h b/linux/include/speech.h
deleted file mode 100644
index b7b1549..0000000
--- a/linux/include/speech.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef SPEECH_H
-#define SPEECH_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct speech_pipeline_s* speech_pipeline_t;
-
-typedef enum {
-    SPEECH_EVENT_READY = 0,
-    SPEECH_EVENT_SPEECH_STARTED,
-    SPEECH_EVENT_SPEECH_ENDED,
-    SPEECH_EVENT_PARTIAL_TRANSCRIPTION,
-    SPEECH_EVENT_TRANSCRIPTION,
-    SPEECH_EVENT_RESPONSE_AUDIO,
-    SPEECH_EVENT_RESPONSE_DONE,
-    SPEECH_EVENT_ERROR
-} speech_event_type_t;
-
-typedef struct {
-    speech_event_type_t type;
-    const char* text;
-    const uint8_t* audio_data;
-    size_t audio_data_length;
-    float confidence;
-    float stt_duration_ms;
-    float tts_duration_ms;
-} speech_event_t;
-
-typedef struct {
-    const char* model_dir;
-    bool use_int8;
-    bool use_qnn;
-    bool enable_enhancer;
-    bool transcribe_only;
-    float min_silence_duration;
-} speech_config_t;
-
-typedef void (*speech_event_fn)(const speech_event_t* event, void* context);
-
-speech_config_t speech_config_default(void);
-
-speech_pipeline_t speech_create(speech_config_t config,
-                                speech_event_fn on_event,
-                                void* event_context);
-
-void speech_start(speech_pipeline_t pipeline);
-
-void speech_push_audio(speech_pipeline_t pipeline,
-                       const float* samples, size_t count);
-
-void speech_resume_listening(speech_pipeline_t pipeline);
-
-void speech_destroy(speech_pipeline_t pipeline);
-
-const char* speech_version(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/linux/setup_linux.sh b/linux/setup_linux.sh
deleted file mode 100755
index 8b7f5a7..0000000
--- a/linux/setup_linux.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-ORT_VERSION="1.19.0"
-OS="${OS:-$(uname -s)}"
-ARCH="${1:-$(uname -m)}"
-
-ROOT="$(cd "$(dirname "$0")/.." && pwd)"
-ORT_DIR="${ROOT}/ort-linux"
-
-echo "=== speech-linux setup (${OS} ${ARCH}) ==="
-
-if [ ! -f "${ORT_DIR}/include/onnxruntime_c_api.h" ]; then
-    echo "Downloading ONNX Runtime ${ORT_VERSION} for ${OS} ${ARCH}..."
-
-    case "${OS}-${ARCH}" in
-        Linux-aarch64|Linux-arm64)
-            ORT_URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-aarch64-${ORT_VERSION}.tgz"
-            ORT_LIB_GLOB="libonnxruntime.so*"
-            ;;
-        Linux-x86_64|Linux-amd64)
-            ORT_URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz"
-            ORT_LIB_GLOB="libonnxruntime.so*"
-            ;;
-        Darwin-arm64|Darwin-aarch64)
-            ORT_URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-osx-arm64-${ORT_VERSION}.tgz"
-            ORT_LIB_GLOB="libonnxruntime*.dylib"
-            ;;
-        Darwin-x86_64)
-            ORT_URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-osx-x86_64-${ORT_VERSION}.tgz"
-            ORT_LIB_GLOB="libonnxruntime*.dylib"
-            ;;
-        *)
-            echo "Unsupported platform: ${OS}-${ARCH}"
-            exit 1
-            ;;
-    esac
-
-    TMP_DIR=$(mktemp -d)
-    curl -L -o "${TMP_DIR}/ort.tgz" "${ORT_URL}"
-
-    mkdir -p "${ORT_DIR}"
-    tar xf "${TMP_DIR}/ort.tgz" -C "${TMP_DIR}"
-
-    # Find extracted dir
-    ORT_EXTRACTED=$(find "${TMP_DIR}" -maxdepth 1 -name "onnxruntime-*" -type d | head -1)
-
-    mkdir -p "${ORT_DIR}/include" "${ORT_DIR}/lib"
-    cp "${ORT_EXTRACTED}"/include/*.h "${ORT_DIR}/include/"
-    cp "${ORT_EXTRACTED}"/lib/${ORT_LIB_GLOB} "${ORT_DIR}/lib/"
-
-    rm -rf "${TMP_DIR}"
-    echo "ONNX Runtime installed to ${ORT_DIR}"
-else
-    echo "ONNX Runtime already installed"
-fi
-
-echo ""
-echo "Build with:"
-echo "  cd linux && cmake -B build -DORT_DIR=${ORT_DIR} && cmake --build build"
diff --git a/linux/src/speech.cpp b/linux/src/speech.cpp
deleted file mode 100644
index fb933fa..0000000
--- a/linux/src/speech.cpp
+++ /dev/null
@@ -1,259 +0,0 @@
-#include "speech.h"
-
-#include <string>
-#include <cstring>
-
-#include <speech_core/speech_core_c.h>
-#include "models/onnx_engine.h"
-#include "models/silero_vad.h"
-#include "models/parakeet_stt.h"
-#include "models/kokoro_tts.h"
-#include "models/deepfilter.h"
-
-// ---------------------------------------------------------------------------
-// Pipeline handle
-// ---------------------------------------------------------------------------
-
-struct speech_pipeline_s {
-    sc_pipeline_t pipeline = nullptr;
-    SileroVad* vad = nullptr;
-    ParakeetStt* stt = nullptr;
-    KokoroTts* tts = nullptr;
-    DeepFilterEnhancer* enhancer = nullptr;
-    speech_event_fn user_callback = nullptr;
-    void* user_context = nullptr;
-
-    ~speech_pipeline_s() {
-        if (pipeline) sc_pipeline_destroy(pipeline);
-        delete enhancer;
-        delete tts;
-        delete stt;
-        delete vad;
-    }
-};
-
-// ---------------------------------------------------------------------------
-// speech-core vtable adapters (pure C++, no platform deps)
-// ---------------------------------------------------------------------------
-
-static float vad_process_chunk(void* ctx, const float* samples, size_t len) {
-    return static_cast<SileroVad*>(ctx)->process_chunk(samples, len);
-}
-static void vad_reset(void* ctx) {
-    static_cast<SileroVad*>(ctx)->reset();
-}
-static int vad_sample_rate(void* ctx) {
-    return static_cast<SileroVad*>(ctx)->input_sample_rate();
-}
-static size_t vad_chunk_size(void* ctx) {
-    return static_cast<SileroVad*>(ctx)->chunk_size();
-}
-
-static sc_transcription_result_t stt_transcribe(
-    void* ctx, const float* audio, size_t len, int sr)
-{
-    auto* stt = static_cast<ParakeetStt*>(ctx);
-    auto r = stt->transcribe(audio, len, sr);
-
-    static thread_local std::string text_buf;
-    static thread_local std::string lang_buf;
-    text_buf = std::move(r.text);
-    lang_buf = std::move(r.language);
-
-    return {
-        .text = text_buf.c_str(),
-        .language = lang_buf.empty() ? nullptr : lang_buf.c_str(),
-        .confidence = r.confidence,
-        .start_time = 0.0f,
-        .end_time = 0.0f,
-    };
-}
-static int stt_sample_rate(void* ctx) {
-    return static_cast<ParakeetStt*>(ctx)->input_sample_rate();
-}
-
-static void tts_synthesize(
-    void* ctx, const char* text, const char* language,
-    sc_tts_chunk_fn on_chunk, void* chunk_ctx)
-{
-    static_cast<KokoroTts*>(ctx)->synthesize(text, language, on_chunk, chunk_ctx);
-}
-static int tts_sample_rate(void* ctx) {
-    return static_cast<KokoroTts*>(ctx)->output_sample_rate();
-}
-static void tts_cancel(void* ctx) {
-    static_cast<KokoroTts*>(ctx)->cancel();
-}
-
-static void enhancer_enhance(
-    void* ctx, const float* input, size_t len, int sr, float* output)
-{
-    static_cast<DeepFilterEnhancer*>(ctx)->enhance(input, len, sr, output);
-}
-static int enhancer_sample_rate(void* ctx) {
-    return static_cast<DeepFilterEnhancer*>(ctx)->input_sample_rate();
-}
-
-// ---------------------------------------------------------------------------
-// Event bridge: sc_event_t → speech_event_t
-// ---------------------------------------------------------------------------
-
-static void on_pipeline_event(const sc_event_t* event, void* context) {
-    auto* h = static_cast<speech_pipeline_s*>(context);
-    if (!h->user_callback) return;
-
-    speech_event_t out = {};
-    out.text = event->text;
-    out.audio_data = event->audio_data;
-    out.audio_data_length = event->audio_data_length;
-    out.confidence = event->confidence;
-    out.stt_duration_ms = event->stt_duration_ms;
-    out.tts_duration_ms = event->tts_duration_ms;
-
-    switch (event->type) {
-        case SC_EVENT_SESSION_CREATED:         out.type = SPEECH_EVENT_READY; break;
-        case SC_EVENT_SPEECH_STARTED:          out.type = SPEECH_EVENT_SPEECH_STARTED; break;
-        case SC_EVENT_SPEECH_ENDED:            out.type = SPEECH_EVENT_SPEECH_ENDED; break;
-        case SC_EVENT_PARTIAL_TRANSCRIPTION:   out.type = SPEECH_EVENT_PARTIAL_TRANSCRIPTION; break;
-        case SC_EVENT_TRANSCRIPTION_COMPLETED: out.type = SPEECH_EVENT_TRANSCRIPTION; break;
-        case SC_EVENT_RESPONSE_AUDIO_DELTA:    out.type = SPEECH_EVENT_RESPONSE_AUDIO; break;
-        case SC_EVENT_RESPONSE_DONE:           out.type = SPEECH_EVENT_RESPONSE_DONE; break;
-        case SC_EVENT_ERROR:                   out.type = SPEECH_EVENT_ERROR; break;
-        default: return;  // skip unmapped events
-    }
-
-    h->user_callback(&out, h->user_context);
-}
-
-// ---------------------------------------------------------------------------
-// Public C API
-// ---------------------------------------------------------------------------
-
-speech_config_t speech_config_default(void) {
-    return {
-        .model_dir = nullptr,
-        .use_int8 = true,
-        .use_qnn = false,
-        .enable_enhancer = false,
-        .transcribe_only = false,
-        .min_silence_duration = 0.4f,
-    };
-}
-
-speech_pipeline_t speech_create(speech_config_t config,
-                                speech_event_fn on_event,
-                                void* event_context)
-{
-    if (!config.model_dir) return nullptr;
-
-    auto* h = new speech_pipeline_s();
-    h->user_callback = on_event;
-    h->user_context = event_context;
-
-    std::string dir(config.model_dir);
-    std::string suffix = config.use_int8 ? "-int8" : "";
-    bool hw_accel = config.use_qnn;
-
-    try {
-        h->vad = new SileroVad(dir + "/silero-vad.onnx");
-        h->stt = new ParakeetStt(
-            dir + "/parakeet-encoder" + suffix + ".onnx",
-            dir + "/parakeet-decoder-joint" + suffix + ".onnx",
-            dir + "/vocab.json",
-            hw_accel);
-        // Skip TTS when transcribe-only — saves model load time and lets
-        // the CLI run on a slimmer model directory (no kokoro-e2e bundle).
-        if (!config.transcribe_only) {
-            h->tts = new KokoroTts(
-                dir + "/kokoro-e2e.onnx",
-                dir + "/voices", dir, hw_accel);
-        }
-
-        // VAD vtable
-        sc_vad_vtable_t vad_vt = {};
-        vad_vt.context = h->vad;
-        vad_vt.process_chunk = vad_process_chunk;
-        vad_vt.reset = ::vad_reset;
-        vad_vt.input_sample_rate = ::vad_sample_rate;
-        vad_vt.chunk_size = ::vad_chunk_size;
-
-        // STT vtable
-        sc_stt_vtable_t stt_vt = {};
-        stt_vt.context = h->stt;
-        stt_vt.transcribe = ::stt_transcribe;
-        stt_vt.input_sample_rate = ::stt_sample_rate;
-
-        // TTS vtable — populated only when TTS was loaded.
-        sc_tts_vtable_t tts_vt = {};
-        if (h->tts) {
-            tts_vt.context = h->tts;
-            tts_vt.synthesize = ::tts_synthesize;
-            tts_vt.output_sample_rate = ::tts_sample_rate;
-            tts_vt.cancel = ::tts_cancel;
-        }
-
-        // Pipeline config
-        sc_config_t sc_cfg = sc_config_default();
-        sc_cfg.min_silence_duration = config.min_silence_duration;
-        if (config.transcribe_only) {
-            sc_cfg.mode = SC_MODE_TRANSCRIBE_ONLY;
-        } else {
-            sc_cfg.mode = SC_MODE_ECHO;
-        }
-
-        h->pipeline = sc_pipeline_create(
-            stt_vt, tts_vt, nullptr, vad_vt,
-            sc_cfg, on_pipeline_event, h);
-
-        if (!h->pipeline) {
-            delete h;
-            return nullptr;
-        }
-
-        // Optional enhancer
-        if (config.enable_enhancer) {
-            std::string aux = dir + "/deepfilter-auxiliary.bin";
-            std::string df = dir + "/deepfilter" + suffix + ".onnx";
-            FILE* f = fopen(df.c_str(), "r");
-            if (f) {
-                fclose(f);
-                h->enhancer = new DeepFilterEnhancer(df, aux, hw_accel);
-                sc_enhancer_vtable_t enh_vt = {};
-                enh_vt.context = h->enhancer;
-                enh_vt.enhance = ::enhancer_enhance;
-                enh_vt.input_sample_rate = ::enhancer_sample_rate;
-                sc_pipeline_set_enhancer(h->pipeline, enh_vt);
-            }
-        }
-
-        return h;
-
-    } catch (const std::exception& e) {
-        LOGE("Pipeline creation failed: %s", e.what());
-        delete h;
-        return nullptr;
-    }
-}
-
-void speech_start(speech_pipeline_t pipeline) {
-    if (pipeline && pipeline->pipeline) sc_pipeline_start(pipeline->pipeline);
-}
-
-void speech_push_audio(speech_pipeline_t pipeline,
-                       const float* samples, size_t count) {
-    if (pipeline && pipeline->pipeline)
-        sc_pipeline_push_audio(pipeline->pipeline, samples, count);
-}
-
-void speech_resume_listening(speech_pipeline_t pipeline) {
-    if (pipeline && pipeline->pipeline)
-        sc_pipeline_resume_listening(pipeline->pipeline);
-}
-
-void speech_destroy(speech_pipeline_t pipeline) {
-    delete pipeline;
-}
-
-const char* speech_version(void) {
-    return "0.0.1";
-}
diff --git a/linux/tests/download_models.sh b/linux/tests/download_models.sh
deleted file mode 100755
index b803f1a..0000000
--- a/linux/tests/download_models.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-# Download ONNX models for testing.
-# Usage: ./download_models.sh [output_dir]
-
-BASE_URL="https://huggingface.co/aufklarer"
-OUT="${1:-$(dirname "$0")/models}"
-mkdir -p "$OUT/voices"
-
-FILES=(
-    "Silero-VAD-v5-ONNX/silero-vad.onnx"
-    "Parakeet-TDT-v3-ONNX/parakeet-encoder-int8.onnx"
-    "Parakeet-TDT-v3-ONNX/parakeet-decoder-joint-int8.onnx"
-    "Parakeet-TDT-v3-ONNX/vocab.json"
-    "Kokoro-82M-ONNX/kokoro-e2e.onnx"
-    "Kokoro-82M-ONNX/kokoro-e2e.onnx.data"
-    "Kokoro-82M-ONNX/vocab_index.json"
-    "Kokoro-82M-ONNX/us_gold.json"
-    "Kokoro-82M-ONNX/us_silver.json"
-    "Kokoro-82M-ONNX/dict_fr.json"
-    "Kokoro-82M-ONNX/dict_es.json"
-    "Kokoro-82M-ONNX/dict_it.json"
-    "Kokoro-82M-ONNX/dict_pt.json"
-    "Kokoro-82M-ONNX/dict_hi.json"
-    "Kokoro-82M-ONNX/voices/af_heart.bin"
-)
-
-for entry in "${FILES[@]}"; do
-    repo="${entry%%/*}"
-    file="${entry#*/}"
-    dest="$OUT/$file"
-    if [ -f "$dest" ]; then
-        continue
-    fi
-    echo "Downloading $file..."
-    curl -sL -o "$dest" "$BASE_URL/$repo/resolve/main/$file"
-done
-
-echo "Models ready in $OUT"
diff --git a/linux/tests/test_pipeline.cpp b/linux/tests/test_pipeline.cpp
deleted file mode 100644
index afe9461..0000000
--- a/linux/tests/test_pipeline.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-#include "speech.h"
-
-#include <cassert>
-#include <cstdio>
-#include <cstring>
-#include <cmath>
-#include <string>
-#include <vector>
-#include <atomic>
-#include <chrono>
-#include <thread>
-
-// ---------------------------------------------------------------------------
-// Test helpers
-// ---------------------------------------------------------------------------
-
-static int tests_run = 0;
-static int tests_passed = 0;
-
-#define TEST(name) \
-    static void test_##name(); \
-    static struct Register_##name { \
-        Register_##name() { test_funcs.push_back({#name, test_##name}); } \
-    } reg_##name; \
-    static void test_##name()
-
-#define ASSERT(cond) do { \
-    if (!(cond)) { \
-        fprintf(stderr, "  FAIL: %s (line %d)\n", #cond, __LINE__); \
-        return; \
-    } \
-} while(0)
-
-#define PASS() tests_passed++
-
-struct TestFunc { const char* name; void (*fn)(); };
-static std::vector<TestFunc> test_funcs;
-
-// ---------------------------------------------------------------------------
-// Tests
-// ---------------------------------------------------------------------------
-
-TEST(config_default) {
-    speech_config_t cfg = speech_config_default();
-    ASSERT(cfg.use_int8 == true);
-    ASSERT(cfg.use_qnn == false);
-    ASSERT(cfg.min_silence_duration > 0.0f);
-    ASSERT(cfg.model_dir == nullptr);
-    PASS();
-}
-
-TEST(version) {
-    const char* v = speech_version();
-    ASSERT(v != nullptr);
-    ASSERT(strlen(v) > 0);
-    PASS();
-}
-
-TEST(create_null_dir_fails) {
-    speech_config_t cfg = speech_config_default();
-    cfg.model_dir = nullptr;
-    speech_pipeline_t p = speech_create(cfg, nullptr, nullptr);
-    ASSERT(p == nullptr);
-    PASS();
-}
-
-TEST(create_bad_dir_fails) {
-    speech_config_t cfg = speech_config_default();
-    cfg.model_dir = "/nonexistent/path";
-    speech_pipeline_t p = speech_create(cfg, nullptr, nullptr);
-    ASSERT(p == nullptr);
-    PASS();
-}
-
-TEST(destroy_null_safe) {
-    speech_destroy(nullptr);
-    PASS();
-}
-
-TEST(push_null_safe) {
-    float buf[512] = {};
-    speech_push_audio(nullptr, buf, 512);
-    speech_start(nullptr);
-    speech_resume_listening(nullptr);
-    PASS();
-}
-
-// If models are available, test the full pipeline
-static const char* find_model_dir() {
-    const char* env = getenv("SPEECH_MODEL_DIR");
-    if (env) return env;
-    // Check common locations
-    static const char* paths[] = {
-        "./models",
-        "../models",
-        "../tests/models",
-        "/opt/speech/models",
-        nullptr
-    };
-    for (const char** p = paths; *p; p++) {
-        char path[512];
-        snprintf(path, sizeof(path), "%s/silero-vad.onnx", *p);
-        FILE* f = fopen(path, "r");
-        if (f) { fclose(f); return *p; }
-    }
-    return nullptr;
-}
-
-struct EventLog {
-    std::atomic<int> transcriptions{0};
-    std::atomic<int> speech_started{0};
-    std::atomic<int> speech_ended{0};
-    std::string last_text;
-};
-
-static void test_event_cb(const speech_event_t* event, void* ctx) {
-    auto* log = static_cast<EventLog*>(ctx);
-    switch (event->type) {
-        case SPEECH_EVENT_SPEECH_STARTED: log->speech_started++; break;
-        case SPEECH_EVENT_SPEECH_ENDED: log->speech_ended++; break;
-        case SPEECH_EVENT_TRANSCRIPTION:
-            log->transcriptions++;
-            if (event->text) log->last_text = event->text;
-            break;
-        default: break;
-    }
-}
-
-TEST(pipeline_lifecycle) {
-    const char* dir = find_model_dir();
-    if (!dir) { fprintf(stderr, "  SKIP (no models)\n"); PASS(); return; }
-
-    speech_config_t cfg = speech_config_default();
-    cfg.model_dir = dir;
-    cfg.transcribe_only = true;
-
-    EventLog log;
-    speech_pipeline_t p = speech_create(cfg, test_event_cb, &log);
-    ASSERT(p != nullptr);
-
-    speech_start(p);
-
-    // Push 2 seconds of silence
-    float silence[512] = {};
-    for (int i = 0; i < 62; i++) {
-        speech_push_audio(p, silence, 512);
-        std::this_thread::sleep_for(std::chrono::milliseconds(5));
-    }
-
-    speech_destroy(p);
-    // No crash = success
-    PASS();
-}
-
-TEST(pipeline_speech_detection) {
-    const char* dir = find_model_dir();
-    if (!dir) { fprintf(stderr, "  SKIP (no models)\n"); PASS(); return; }
-
-    speech_config_t cfg = speech_config_default();
-    cfg.model_dir = dir;
-    cfg.transcribe_only = true;
-
-    EventLog log;
-    speech_pipeline_t p = speech_create(cfg, test_event_cb, &log);
-    ASSERT(p != nullptr);
-
-    speech_start(p);
-
-    // Push speech-like signal (150Hz buzz) for 1.5s
-    float speech[512];
-    for (int chunk = 0; chunk < 47; chunk++) {
-        for (int i = 0; i < 512; i++) {
-            float t = (float)(chunk * 512 + i) / 16000.0f;
-            speech[i] = 0.3f * sinf(2.0f * 3.14159f * 150.0f * t)
-                       + 0.2f * sinf(2.0f * 3.14159f * 300.0f * t);
-        }
-        speech_push_audio(p, speech, 512);
-        std::this_thread::sleep_for(std::chrono::milliseconds(5));
-    }
-
-    // Push 1.5s silence to trigger end-of-speech
-    float silence[512] = {};
-    for (int i = 0; i < 47; i++) {
-        speech_push_audio(p, silence, 512);
-        std::this_thread::sleep_for(std::chrono::milliseconds(5));
-    }
-
-    // Wait for processing
-    std::this_thread::sleep_for(std::chrono::seconds(3));
-
-    speech_destroy(p);
-
-    // VAD should have detected speech
-    ASSERT(log.speech_started > 0);
-    PASS();
-}
-
-TEST(resume_listening_null_safe) {
-    speech_resume_listening(nullptr);
-    PASS();
-}
-
-TEST(pipeline_multiple_sessions) {
-    const char* dir = find_model_dir();
-    if (!dir) { fprintf(stderr, "  SKIP (no models)\n"); PASS(); return; }
-
-    for (int session = 0; session < 3; session++) {
-        speech_config_t cfg = speech_config_default();
-        cfg.model_dir = dir;
-        cfg.transcribe_only = true;
-
-        EventLog log;
-        speech_pipeline_t p = speech_create(cfg, test_event_cb, &log);
-        ASSERT(p != nullptr);
-
-        speech_start(p);
-
-        // Push 1 second of silence
-        float silence[512] = {};
-        for (int i = 0; i < 31; i++) {
-            speech_push_audio(p, silence, 512);
-            std::this_thread::sleep_for(std::chrono::milliseconds(5));
-        }
-
-        speech_destroy(p);
-    }
-    // No crash or leak after 3 create/destroy cycles
-    PASS();
-}
-
-TEST(pipeline_concurrent_push) {
-    const char* dir = find_model_dir();
-    if (!dir) { fprintf(stderr, "  SKIP (no models)\n"); PASS(); return; }
-
-    speech_config_t cfg = speech_config_default();
-    cfg.model_dir = dir;
-    cfg.transcribe_only = true;
-
-    EventLog log;
-    speech_pipeline_t p = speech_create(cfg, test_event_cb, &log);
-    ASSERT(p != nullptr);
-
-    speech_start(p);
-
-    // Push audio from 4 threads concurrently
-    std::vector<std::thread> threads;
-    for (int t = 0; t < 4; t++) {
-        threads.emplace_back([p]() {
-            float buf[512] = {};
-            for (int i = 0; i < 50; i++) {
-                speech_push_audio(p, buf, 512);
-                std::this_thread::sleep_for(std::chrono::milliseconds(2));
-            }
-        });
-    }
-
-    for (auto& th : threads) {
-        th.join();
-    }
-
-    speech_destroy(p);
-    // No crash under concurrent push
-    PASS();
-}
-
-// ---------------------------------------------------------------------------
-// Main
-// ---------------------------------------------------------------------------
-
-int main() {
-    fprintf(stderr, "speech-linux tests (%s)\n\n", speech_version());
-
-    for (auto& t : test_funcs) {
-        tests_run++;
-        fprintf(stderr, "  %s... ", t.name);
-        t.fn();
-        if (tests_passed == tests_run) {
-            fprintf(stderr, "ok\n");
-        }
-    }
-
-    fprintf(stderr, "\n%d/%d passed\n", tests_passed, tests_run);
-    return tests_passed == tests_run ? 0 : 1;
-}
diff --git a/linux/toolchain-aarch64.cmake b/linux/toolchain-aarch64.cmake
deleted file mode 100644
index 272ed2d..0000000
--- a/linux/toolchain-aarch64.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR aarch64)
-
-# Yocto SDK cross-compiler (source environment-setup-aarch64-poky-linux first)
-set(CMAKE_C_COMPILER   $ENV{CC}  CACHE STRING "" FORCE)
-set(CMAKE_CXX_COMPILER $ENV{CXX} CACHE STRING "" FORCE)
-set(CMAKE_SYSROOT      $ENV{SDKTARGETSYSROOT} CACHE STRING "" FORCE)
-
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/linux/tools/phonemize.cpp b/linux/tools/phonemize.cpp
deleted file mode 100644
index a183df3..0000000
--- a/linux/tools/phonemize.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// Tiny CLI that dumps the phoneme string + token IDs the Kokoro phonemizer
-// produces for a piece of text. Used to verify text→phoneme conversion is
-// correct before blaming the TTS model.
-//
-// Usage: speech_phonemize <model_dir> "<text>" [language]
-
-#include "models/kokoro_phonemizer.h"
-
-#include <cstdio>
-#include <string>
-
-int main(int argc, char** argv) {
-    if (argc < 3) {
-        std::fprintf(stderr,
-            "usage: %s <model_dir> \"<text>\" [language]\n"
-            "  model_dir : directory holding vocab_index.json + dictionaries\n"
-            "  language  : BCP-47 tag (default: en)\n",
-            argv[0]);
-        return 2;
-    }
-    const std::string model_dir = argv[1];
-    const std::string text      = argv[2];
-    const std::string language  = (argc >= 4) ? argv[3] : "en";
-
-    KokoroPhonemizer p;
-    if (!p.load_vocab(model_dir + "/vocab_index.json")) {
-        std::fprintf(stderr, "failed to load vocab from %s/vocab_index.json\n",
-                     model_dir.c_str());
-        return 1;
-    }
-    p.load_dictionaries(model_dir);
-    for (const char* lang : {"fr", "es", "it", "pt", "hi"}) {
-        p.load_language_dict(lang, model_dir + "/dict_" + std::string(lang) + ".json");
-    }
-    p.set_language(language);
-
-    std::string phonemes = p.text_to_phonemes(text);
-    auto ids = p.tokenize(text, 128);
-
-    std::printf("text     : %s\n", text.c_str());
-    std::printf("language : %s\n", language.c_str());
-    std::printf("phonemes : %s\n", phonemes.c_str());
-    std::printf("tokens   : [%zu]", ids.size());
-    for (auto id : ids) std::printf(" %lld", static_cast<long long>(id));
-    std::printf("\n");
-    return 0;
-}
diff --git a/linux/tools/synthesize.cpp b/linux/tools/synthesize.cpp
deleted file mode 100644
index 2701608..0000000
--- a/linux/tools/synthesize.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-// Tiny CLI that runs Kokoro TTS on a piece of text and writes the audio to a WAV.
-//
-// Usage: speech_synthesize <model_dir> <output.wav> "<text>" [language]
-//
-// Pairs with speech_transcribe — round-trip a known prompt through synthesis
-// and back through STT to surface phonemizer / tokenizer / decoder bugs
-// without bouncing through Android.
-//
-// Calls KokoroTts directly (skipping the speech-core pipeline) so we can
-// inspect the raw audio buffer the model emits.
-
-#include "models/kokoro_tts.h"
-
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <string>
-#include <vector>
-
-namespace {
-
-constexpr int kSampleRate = 24000;
-
-struct ChunkSink {
-    std::vector<float> samples;
-};
-
-static void on_chunk(const float* samples, size_t length,
-                     bool /*is_final*/, void* ctx) {
-    auto* sink = static_cast<ChunkSink*>(ctx);
-    sink->samples.insert(sink->samples.end(), samples, samples + length);
-}
-
-static bool write_wav(const std::string& path,
-                      const float* samples, size_t count, int sample_rate) {
-    std::ofstream f(path, std::ios::binary);
-    if (!f.is_open()) return false;
-
-    auto put32 = [&](uint32_t v) {
-        char b[4] = {char(v & 0xFF), char((v >> 8) & 0xFF),
-                     char((v >> 16) & 0xFF), char((v >> 24) & 0xFF)};
-        f.write(b, 4);
-    };
-    auto put16 = [&](uint16_t v) {
-        char b[2] = {char(v & 0xFF), char((v >> 8) & 0xFF)};
-        f.write(b, 2);
-    };
-
-    const uint32_t data_bytes = static_cast<uint32_t>(count) * 2;
-    f.write("RIFF", 4); put32(36 + data_bytes);
-    f.write("WAVE", 4);
-    f.write("fmt ", 4); put32(16);
-    put16(1);                               // PCM
-    put16(1);                               // mono
-    put32(static_cast<uint32_t>(sample_rate));
-    put32(static_cast<uint32_t>(sample_rate) * 2);
-    put16(2);                               // block align
-    put16(16);                              // bits/sample
-    f.write("data", 4); put32(data_bytes);
-
-    for (size_t i = 0; i < count; i++) {
-        float clamped = samples[i];
-        if (clamped < -1.0f) clamped = -1.0f;
-        if (clamped >  1.0f) clamped =  1.0f;
-        int16_t v = static_cast<int16_t>(clamped * 32767.0f);
-        put16(static_cast<uint16_t>(v));
-    }
-    return f.good();
-}
-
-}  // namespace
-
-int main(int argc, char** argv) {
-    if (argc < 4) {
-        std::fprintf(stderr,
-            "usage: %s <model_dir> <output.wav> \"<text>\" [language]\n"
-            "  model_dir : directory holding kokoro-e2e.onnx + voices/*.bin\n"
-            "  language  : BCP-47 tag (default: en). Auto-switches voice.\n",
-            argv[0]);
-        return 2;
-    }
-    const std::string model_dir = argv[1];
-    const std::string out_wav   = argv[2];
-    const std::string text      = argv[3];
-    const std::string language  = (argc >= 5) ? argv[4] : "en";
-
-    KokoroTts tts(model_dir + "/kokoro-e2e.onnx",
-                  model_dir + "/voices",
-                  model_dir,
-                  /*nnapi=*/false);
-
-    ChunkSink sink;
-    tts.synthesize(text.c_str(), language.c_str(), on_chunk, &sink);
-
-    if (sink.samples.empty()) {
-        std::fprintf(stderr, "synthesis produced no audio\n");
-        return 1;
-    }
-
-    if (!write_wav(out_wav, sink.samples.data(), sink.samples.size(), kSampleRate)) {
-        std::fprintf(stderr, "could not write %s\n", out_wav.c_str());
-        return 1;
-    }
-    std::fprintf(stderr, "wrote %zu samples (%.2fs @ %d Hz) to %s\n",
-                 sink.samples.size(),
-                 double(sink.samples.size()) / double(kSampleRate),
-                 kSampleRate, out_wav.c_str());
-    return 0;
-}
diff --git a/linux/tools/transcribe.cpp b/linux/tools/transcribe.cpp
deleted file mode 100644
index 2bad307..0000000
--- a/linux/tools/transcribe.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-// Tiny CLI that runs Parakeet STT on a WAV file and prints what it heard.
-//
-// Usage: speech_transcribe <model_dir> <input.wav>
-//
-// Reads PCM Float32 / Int16 / Int24 mono or stereo at any sample rate, then
-// resamples + downmixes to 16 kHz mono Float32 and feeds it through the
-// pipeline. Useful for diagnosing TTS round-trip quality (synthesise speech,
-// transcribe it back, compare to the original prompt).
-//
-// No external deps beyond libspeech.
-
-#include "speech.h"
-
-#include <atomic>
-#include <chrono>
-#include <condition_variable>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <mutex>
-#include <string>
-#include <thread>
-#include <vector>
-
-namespace {
-
-constexpr int kTargetSampleRate = 16000;
-constexpr size_t kChunkSamples = 512;  // 32 ms at 16 kHz
-
-// ---------------------------------------------------------------------------
-// WAV reader
-// ---------------------------------------------------------------------------
-
-struct WavData {
-    std::vector<float> samples;  // mono, target sample rate
-    int sample_rate = 0;
-    int original_sample_rate = 0;
-    int original_channels = 0;
-    int original_bits = 0;
-};
-
-static uint32_t read_u32(const uint8_t* p) {
-    return uint32_t(p[0]) | (uint32_t(p[1]) << 8)
-         | (uint32_t(p[2]) << 16) | (uint32_t(p[3]) << 24);
-}
-static uint16_t read_u16(const uint8_t* p) {
-    return uint16_t(p[0]) | (uint16_t(p[1]) << 8);
-}
-
-static bool load_wav(const std::string& path, WavData& out, std::string& err) {
-    std::ifstream f(path, std::ios::binary);
-    if (!f.is_open()) { err = "cannot open " + path; return false; }
-
-    std::vector<uint8_t> bytes((std::istreambuf_iterator<char>(f)),
-                               std::istreambuf_iterator<char>());
-    if (bytes.size() < 44) { err = "file too small to be a WAV"; return false; }
-    if (std::memcmp(bytes.data(), "RIFF", 4) != 0 ||
-        std::memcmp(bytes.data() + 8, "WAVE", 4) != 0) {
-        err = "not a RIFF/WAVE file";
-        return false;
-    }
-
-    // Walk chunks looking for fmt + data.
-    size_t pos = 12;
-    uint16_t fmt_format = 0, fmt_channels = 0, fmt_bits = 0;
-    uint32_t fmt_rate = 0;
-    const uint8_t* data_ptr = nullptr;
-    uint32_t data_len = 0;
-    while (pos + 8 <= bytes.size()) {
-        const uint8_t* hdr = bytes.data() + pos;
-        const char tag[5] = {char(hdr[0]), char(hdr[1]), char(hdr[2]), char(hdr[3]), 0};
-        uint32_t chunk_len = read_u32(hdr + 4);
-        if (pos + 8 + chunk_len > bytes.size()) break;
-        if (std::strcmp(tag, "fmt ") == 0 && chunk_len >= 16) {
-            fmt_format = read_u16(hdr + 8);
-            fmt_channels = read_u16(hdr + 10);
-            fmt_rate = read_u32(hdr + 12);
-            fmt_bits = read_u16(hdr + 22);
-        } else if (std::strcmp(tag, "data") == 0) {
-            data_ptr = hdr + 8;
-            data_len = chunk_len;
-            break;
-        }
-        pos += 8 + chunk_len + (chunk_len & 1);  // pad to even
-    }
-    if (!data_ptr || fmt_channels == 0) {
-        err = "WAV has no fmt or data chunk";
-        return false;
-    }
-    if (fmt_format != 1 /*PCM*/ && fmt_format != 3 /*FLOAT*/) {
-        err = "WAV format " + std::to_string(fmt_format)
-            + " unsupported (need PCM=1 or FLOAT=3)";
-        return false;
-    }
-
-    out.original_sample_rate = static_cast<int>(fmt_rate);
-    out.original_channels = fmt_channels;
-    out.original_bits = fmt_bits;
-
-    // Decode + downmix to mono float
-    const size_t bytes_per_sample = fmt_bits / 8;
-    const size_t frame_bytes = bytes_per_sample * fmt_channels;
-    const size_t frame_count = data_len / frame_bytes;
-    std::vector<float> mono(frame_count);
-    for (size_t i = 0; i < frame_count; i++) {
-        float sum = 0.0f;
-        for (int c = 0; c < fmt_channels; c++) {
-            const uint8_t* sp = data_ptr + i * frame_bytes + c * bytes_per_sample;
-            float s = 0.0f;
-            if (fmt_format == 3 && fmt_bits == 32) {
-                std::memcpy(&s, sp, 4);
-            } else if (fmt_format == 1 && fmt_bits == 16) {
-                int16_t v = int16_t(uint16_t(sp[0]) | (uint16_t(sp[1]) << 8));
-                s = float(v) / 32768.0f;
-            } else if (fmt_format == 1 && fmt_bits == 24) {
-                int32_t v = int32_t(uint32_t(sp[0])
-                          | (uint32_t(sp[1]) << 8) | (uint32_t(sp[2]) << 16));
-                if (v & 0x800000) v |= 0xFF000000;  // sign extend
-                s = float(v) / 8388608.0f;
-            } else if (fmt_format == 1 && fmt_bits == 32) {
-                int32_t v = int32_t(read_u32(sp));
-                s = float(v) / 2147483648.0f;
-            } else {
-                err = "unsupported sample width " + std::to_string(fmt_bits);
-                return false;
-            }
-            sum += s;
-        }
-        mono[i] = sum / float(fmt_channels);
-    }
-
-    // Linear-interpolation resample to kTargetSampleRate. Cheap, but
-    // adequate for diagnosing model output — TTS bandwidth is well below
-    // 8 kHz so aliasing isn't a meaningful concern here.
-    if (static_cast<int>(fmt_rate) == kTargetSampleRate) {
-        out.samples = std::move(mono);
-    } else {
-        const double ratio = double(fmt_rate) / double(kTargetSampleRate);
-        const size_t out_len = size_t(double(mono.size()) / ratio);
-        out.samples.resize(out_len);
-        for (size_t i = 0; i < out_len; i++) {
-            double src = double(i) * ratio;
-            size_t i0 = size_t(src);
-            double frac = src - double(i0);
-            float a = mono[i0];
-            float b = (i0 + 1 < mono.size()) ? mono[i0 + 1] : a;
-            out.samples[i] = float(double(a) + (double(b) - double(a)) * frac);
-        }
-    }
-    out.sample_rate = kTargetSampleRate;
-    return true;
-}
-
-// ---------------------------------------------------------------------------
-// Pipeline event handler
-// ---------------------------------------------------------------------------
-
-struct Result {
-    std::mutex mu;
-    std::condition_variable cv;
-    std::string text;
-    float confidence = 0.0f;
-    bool done = false;
-    bool error = false;
-};
-
-static void on_event(const speech_event_t* event, void* ctx) {
-    auto* r = static_cast<Result*>(ctx);
-    std::unique_lock<std::mutex> lock(r->mu);
-    switch (event->type) {
-        case SPEECH_EVENT_TRANSCRIPTION:
-            if (event->text) r->text = event->text;
-            r->confidence = event->confidence;
-            r->done = true;
-            r->cv.notify_all();
-            break;
-        case SPEECH_EVENT_PARTIAL_TRANSCRIPTION:
-            if (event->text) {
-                std::cerr << "  [partial] " << event->text << "\r" << std::flush;
-            }
-            break;
-        case SPEECH_EVENT_ERROR:
-            std::cerr << "  [error] " << (event->text ? event->text : "") << "\n";
-            r->error = true;
-            r->done = true;
-            r->cv.notify_all();
-            break;
-        default:
-            break;
-    }
-}
-
-}  // namespace
-
-int main(int argc, char** argv) {
-    if (argc != 3) {
-        std::fprintf(stderr,
-            "usage: %s <model_dir> <input.wav>\n"
-            "  model_dir : directory holding parakeet-* + silero-vad.onnx\n"
-            "  input.wav : audio to transcribe (mono or stereo, 16-bit/24-bit/float)\n",
-            argv[0]);
-        return 2;
-    }
-    const std::string model_dir = argv[1];
-    const std::string wav_path  = argv[2];
-
-    WavData wav;
-    std::string err;
-    if (!load_wav(wav_path, wav, err)) {
-        std::fprintf(stderr, "wav: %s\n", err.c_str());
-        return 1;
-    }
-    std::fprintf(stderr,
-        "loaded %s: %d Hz × %dch × %d-bit → %.2fs of 16 kHz mono\n",
-        wav_path.c_str(),
-        wav.original_sample_rate, wav.original_channels, wav.original_bits,
-        double(wav.samples.size()) / double(wav.sample_rate));
-
-    speech_config_t cfg = speech_config_default();
-    cfg.model_dir = model_dir.c_str();
-    cfg.transcribe_only = true;
-
-    Result result;
-    speech_pipeline_t pipeline = speech_create(cfg, on_event, &result);
-    if (!pipeline) {
-        std::fprintf(stderr, "speech_create failed (model dir? missing files?)\n");
-        return 1;
-    }
-    speech_start(pipeline);
-
-    // Push real audio
-    for (size_t off = 0; off < wav.samples.size(); off += kChunkSamples) {
-        size_t n = std::min(kChunkSamples, wav.samples.size() - off);
-        speech_push_audio(pipeline, wav.samples.data() + off, n);
-    }
-    // Trailing 1.5 s of silence so VAD sees end-of-utterance and Parakeet flushes
-    std::vector<float> silence(kChunkSamples, 0.0f);
-    for (int i = 0; i < 47; i++) {
-        speech_push_audio(pipeline, silence.data(), silence.size());
-    }
-
-    // Wait up to 30 s for the transcription event
-    {
-        std::unique_lock<std::mutex> lock(result.mu);
-        result.cv.wait_for(lock, std::chrono::seconds(30),
-                           [&]{ return result.done; });
-    }
-
-    speech_destroy(pipeline);
-
-    if (!result.done || result.error) {
-        std::fprintf(stderr, "transcription did not complete\n");
-        return 1;
-    }
-    // Result on stdout — single line, useful for piping
-    std::printf("%s\n", result.text.c_str());
-    std::fprintf(stderr, "confidence: %.3f\n", result.confidence);
-    return 0;
-}
diff --git a/sdk/src/main/cpp/CMakeLists.txt b/sdk/src/main/cpp/CMakeLists.txt
index 3dc8bc8..16f6ceb 100644
--- a/sdk/src/main/cpp/CMakeLists.txt
+++ b/sdk/src/main/cpp/CMakeLists.txt
@@ -6,71 +6,41 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 # ---------------------------------------------------------------------------
 # speech-core (git submodule)
+#
+# Pull in speech-core via add_subdirectory so we reuse its target definitions
+# rather than re-listing sources here. SPEECH_CORE_WITH_ONNX=ON adds the
+# speech_core_models target (Silero VAD / Parakeet STT / Kokoro TTS /
+# DeepFilterNet wrappers). Tests and examples are off — the Android NDK
+# can't run host ctest binaries anyway.
 # ---------------------------------------------------------------------------
 
 if(NOT DEFINED SPEECH_CORE_DIR)
-    message(FATAL_ERROR "SPEECH_CORE_DIR must be set")
+    message(FATAL_ERROR "SPEECH_CORE_DIR must be set (path to the speech-core submodule)")
 endif()
 
-set(SPEECH_CORE_SOURCES
-    ${SPEECH_CORE_DIR}/src/pipeline/voice_pipeline.cpp
-    ${SPEECH_CORE_DIR}/src/pipeline/turn_detector.cpp
-    ${SPEECH_CORE_DIR}/src/pipeline/speech_queue.cpp
-    ${SPEECH_CORE_DIR}/src/pipeline/conversation_context.cpp
-    ${SPEECH_CORE_DIR}/src/vad/streaming_vad.cpp
-    ${SPEECH_CORE_DIR}/src/audio/audio_buffer.cpp
-    ${SPEECH_CORE_DIR}/src/audio/resampler.cpp
-    ${SPEECH_CORE_DIR}/src/audio/pcm_codec.cpp
-    ${SPEECH_CORE_DIR}/src/tools/tool_registry.cpp
-    ${SPEECH_CORE_DIR}/src/tools/intent_matcher.cpp
-    ${SPEECH_CORE_DIR}/src/tools/tool_executor.cpp
-    ${SPEECH_CORE_DIR}/src/speech_core_c.cpp
-)
-
-add_library(speech_core STATIC ${SPEECH_CORE_SOURCES})
-target_include_directories(speech_core PUBLIC ${SPEECH_CORE_DIR}/include)
-target_compile_options(speech_core PRIVATE -O2)
-
-# ---------------------------------------------------------------------------
-# ONNX Runtime (prebuilt)
-# ---------------------------------------------------------------------------
-
 if(NOT DEFINED ORT_DIR)
-    message(FATAL_ERROR "ORT_DIR must be set (path to onnxruntime with include/ and lib/)")
+    message(FATAL_ERROR "ORT_DIR must be set (path to onnxruntime with include/ and lib/${ANDROID_ABI}/libonnxruntime.so)")
 endif()
 
-add_library(onnxruntime SHARED IMPORTED)
-set_target_properties(onnxruntime PROPERTIES
-    IMPORTED_LOCATION ${ORT_DIR}/lib/${ANDROID_ABI}/libonnxruntime.so
-)
+set(SPEECH_CORE_WITH_ONNX     ON  CACHE BOOL "" FORCE)
+set(SPEECH_CORE_BUILD_TESTS   OFF CACHE BOOL "" FORCE)
+set(SPEECH_CORE_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
+
+add_subdirectory(${SPEECH_CORE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/speech_core)
 
 # ---------------------------------------------------------------------------
-# speech-android native library
+# speech-android native library — JNI bridge only
+#
+# All audio DSP / model wrappers / onnx_engine now live in speech-core's
+# speech_core_models target. The bridge just constructs them and feeds
+# them into speech_core::VoicePipeline.
 # ---------------------------------------------------------------------------
 
-add_library(speech_android SHARED
-    jni_bridge.cpp
-    audio/mel.cpp
-    audio/fft.cpp
-    audio/stft.cpp
-    models/soc_detect.cpp
-    models/silero_vad.cpp
-    models/parakeet_stt.cpp
-    models/kokoro_tts.cpp
-    models/kokoro_phonemizer.cpp
-    models/kokoro_multilingual.cpp
-    models/deepfilter.cpp
-)
-
-target_include_directories(speech_android PRIVATE
-    ${ORT_DIR}/include
-    ${SPEECH_CORE_DIR}/include
-    ${CMAKE_CURRENT_SOURCE_DIR}
-)
+add_library(speech_android SHARED jni_bridge.cpp)
 
 target_link_libraries(speech_android
-    speech_core
-    onnxruntime
-    android
-    log
+    PRIVATE
+        speech_core_models
+        android
+        log
 )
diff --git a/sdk/src/main/cpp/audio/fft.cpp b/sdk/src/main/cpp/audio/fft.cpp
deleted file mode 100644
index dc221fd..0000000
--- a/sdk/src/main/cpp/audio/fft.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-#include "fft.h"
-#include <cmath>
-#include <vector>
-
-static void fft_complex(float* re, float* im, size_t n, bool inverse) {
-    // Bit-reversal permutation
-    for (size_t i = 1, j = 0; i < n; i++) {
-        size_t bit = n >> 1;
-        while (j & bit) { j ^= bit; bit >>= 1; }
-        j ^= bit;
-        if (i < j) {
-            std::swap(re[i], re[j]);
-            std::swap(im[i], im[j]);
-        }
-    }
-
-    // Cooley-Tukey
-    float sign = inverse ? 1.0f : -1.0f;
-    for (size_t len = 2; len <= n; len <<= 1) {
-        float ang = sign * 2.0f * static_cast<float>(M_PI) / static_cast<float>(len);
-        float wr = std::cos(ang), wi = std::sin(ang);
-
-        for (size_t i = 0; i < n; i += len) {
-            float cur_r = 1.0f, cur_i = 0.0f;
-            for (size_t j = 0; j < len / 2; j++) {
-                size_t u = i + j, v = i + j + len / 2;
-                float tr = re[v] * cur_r - im[v] * cur_i;
-                float ti = re[v] * cur_i + im[v] * cur_r;
-                re[v] = re[u] - tr;
-                im[v] = im[u] - ti;
-                re[u] += tr;
-                im[u] += ti;
-                float new_r = cur_r * wr - cur_i * wi;
-                cur_i = cur_r * wi + cur_i * wr;
-                cur_r = new_r;
-            }
-        }
-    }
-
-    if (inverse) {
-        float inv_n = 1.0f / static_cast<float>(n);
-        for (size_t i = 0; i < n; i++) {
-            re[i] *= inv_n;
-            im[i] *= inv_n;
-        }
-    }
-}
-
-// Zero-pad to next power of 2 for non-power-of-2 FFT sizes
-static size_t next_pow2(size_t n) {
-    size_t p = 1;
-    while (p < n) p <<= 1;
-    return p;
-}
-
-void fft_real(const float* input, size_t n,
-              float* out_real, float* out_imag)
-{
-    size_t N = next_pow2(n);
-    std::vector<float> re(N, 0.0f), im(N, 0.0f);
-    for (size_t i = 0; i < n; i++) re[i] = input[i];
-
-    fft_complex(re.data(), im.data(), N, false);
-
-    size_t bins = n / 2 + 1;
-    for (size_t i = 0; i < bins; i++) {
-        out_real[i] = re[i];
-        out_imag[i] = im[i];
-    }
-}
-
-void ifft_real(const float* in_real, const float* in_imag, size_t n,
-               float* output)
-{
-    size_t N = next_pow2(n);
-    std::vector<float> re(N, 0.0f), im(N, 0.0f);
-
-    size_t bins = n / 2 + 1;
-    for (size_t i = 0; i < bins; i++) {
-        re[i] = in_real[i];
-        im[i] = in_imag[i];
-    }
-    // Conjugate symmetry
-    for (size_t i = bins; i < N; i++) {
-        re[i] =  re[N - i];
-        im[i] = -im[N - i];
-    }
-
-    fft_complex(re.data(), im.data(), N, true);
-
-    for (size_t i = 0; i < n; i++) output[i] = re[i];
-}
diff --git a/sdk/src/main/cpp/audio/fft.h b/sdk/src/main/cpp/audio/fft.h
deleted file mode 100644
index bfe5433..0000000
--- a/sdk/src/main/cpp/audio/fft.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <vector>
-
-/// Minimal radix-2 FFT for Android NDK (no external dependencies).
-/// Operates on real signals — returns complex spectrum [0..N/2].
-
-void fft_real(const float* input, size_t n,
-              float* out_real, float* out_imag);
-
-void ifft_real(const float* in_real, const float* in_imag, size_t n,
-               float* output);
diff --git a/sdk/src/main/cpp/audio/mel.cpp b/sdk/src/main/cpp/audio/mel.cpp
deleted file mode 100644
index 3d2edf0..0000000
--- a/sdk/src/main/cpp/audio/mel.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-#include "mel.h"
-#include "fft.h"
-#include <cmath>
-#include <algorithm>
-
-// HTK mel scale (used when slaney_norm=false).
-static float htk_hz_to_mel(float hz) {
-    return 2595.0f * std::log10(1.0f + hz / 700.0f);
-}
-static float htk_mel_to_hz(float mel) {
-    return 700.0f * (std::pow(10.0f, mel / 2595.0f) - 1.0f);
-}
-
-// Slaney mel scale (used when slaney_norm=true):
-//   Linear below 1000 Hz:  mel = 3 * f / 200
-//   Log above 1000 Hz:     mel = 15 + 27 * log(f/1000) / log(6.4)
-static constexpr float kSlaneyBreakHz = 1000.0f;
-static constexpr float kSlaneyBreakMel = 15.0f;      // 3 * 1000 / 200
-static const float kSlaneyLogStep = 27.0f / std::log(6.4f);  // ≈ 14.536
-
-static float slaney_hz_to_mel(float hz) {
-    if (hz < kSlaneyBreakHz)
-        return 3.0f * hz / 200.0f;
-    return kSlaneyBreakMel + std::log(hz / kSlaneyBreakHz) * kSlaneyLogStep;
-}
-static float slaney_mel_to_hz(float mel) {
-    if (mel < kSlaneyBreakMel)
-        return 200.0f * mel / 3.0f;
-    return kSlaneyBreakHz * std::exp((mel - kSlaneyBreakMel) / kSlaneyLogStep);
-}
-
-static std::vector<float> mel_filterbank(
-    int num_mel_bins, int n_fft, int sample_rate, bool slaney_norm)
-{
-    int num_bins = n_fft / 2 + 1;
-
-    // Choose mel scale: Slaney (torchaudio default) when slaney_norm is on,
-    // HTK otherwise (backward compat).
-    auto hz2mel = slaney_norm ? slaney_hz_to_mel : htk_hz_to_mel;
-    auto mel2hz = slaney_norm ? slaney_mel_to_hz : htk_mel_to_hz;
-
-    float mel_low = hz2mel(0.0f);
-    float mel_high = hz2mel(static_cast<float>(sample_rate) / 2.0f);
-
-    std::vector<float> mel_points(num_mel_bins + 2);
-    // Hz centres of each mel point (for Slaney norm later).
-    std::vector<float> hz_points(num_mel_bins + 2);
-    for (int i = 0; i < num_mel_bins + 2; i++) {
-        float mel = mel_low + (mel_high - mel_low) * i / (num_mel_bins + 1);
-        hz_points[i] = mel2hz(mel);
-    }
-
-    // Convert to FFT bin indices
-    std::vector<float> bin_freqs(num_mel_bins + 2);
-    for (int i = 0; i < num_mel_bins + 2; i++) {
-        bin_freqs[i] = hz_points[i] * n_fft / sample_rate;
-    }
-
-    // Triangular filters [num_mel_bins * num_bins]
-    std::vector<float> fb(num_mel_bins * num_bins, 0.0f);
-    for (int m = 0; m < num_mel_bins; m++) {
-        float left = bin_freqs[m];
-        float center = bin_freqs[m + 1];
-        float right = bin_freqs[m + 2];
-
-        for (int f = 0; f < num_bins; f++) {
-            float ff = static_cast<float>(f);
-            if (ff >= left && ff <= center && center > left) {
-                fb[m * num_bins + f] = (ff - left) / (center - left);
-            } else if (ff > center && ff <= right && right > center) {
-                fb[m * num_bins + f] = (right - ff) / (right - center);
-            }
-        }
-
-        // Slaney normalization: divide each filter by its bandwidth in Hz
-        // so the filter has unit area. Matches torchaudio norm="slaney".
-        if (slaney_norm) {
-            float bandwidth = hz_points[m + 2] - hz_points[m];
-            if (bandwidth > 0.0f) {
-                float enorm = 2.0f / bandwidth;
-                for (int f = 0; f < num_bins; f++) {
-                    fb[m * num_bins + f] *= enorm;
-                }
-            }
-        }
-    }
-    return fb;
-}
-
-std::vector<float> mel_spectrogram(
-    const float* audio, size_t length,
-    int sample_rate, int n_fft, int hop_length,
-    int win_length, int num_mel_bins,
-    bool slaney_norm, float log_floor, bool center)
-{
-    // Optional center padding: pad signal by n_fft/2 on each side using
-    // reflect mode (matches torchaudio / NeMo center=True).
-    std::vector<float> padded;
-    const float* sig = audio;
-    size_t sig_len = length;
-
-    if (center) {
-        int pad = n_fft / 2;
-        sig_len = length + 2 * static_cast<size_t>(pad);
-        padded.resize(sig_len);
-
-        // Left reflect padding: padded[pad-1-i] = audio[i+1] for i in [0, pad-1)
-        for (int i = 0; i < pad; ++i) {
-            int src = std::min(i + 1, static_cast<int>(length) - 1);
-            padded[pad - 1 - i] = audio[src];
-        }
-        // Copy original signal
-        std::copy(audio, audio + length, padded.begin() + pad);
-        // Right reflect padding
-        for (int i = 0; i < pad; ++i) {
-            int src = std::max(static_cast<int>(length) - 2 - i, 0);
-            padded[pad + static_cast<int>(length) + i] = audio[src];
-        }
-        sig = padded.data();
-    }
-
-    int num_bins = n_fft / 2 + 1;
-    int num_frames = static_cast<int>((sig_len - static_cast<size_t>(win_length))
-                                      / hop_length) + 1;
-    if (num_frames <= 0) return {};
-
-    auto fb = mel_filterbank(num_mel_bins, n_fft, sample_rate, slaney_norm);
-
-    // Hann window
-    std::vector<float> window(win_length);
-    for (int i = 0; i < win_length; i++) {
-        window[i] = 0.5f * (1.0f - std::cos(2.0f * static_cast<float>(M_PI)
-                    * i / (win_length - 1)));
-    }
-
-    // STFT + mel
-    std::vector<float> mel(num_mel_bins * num_frames);
-    std::vector<float> frame(n_fft, 0.0f);
-    std::vector<float> spec_re(num_bins), spec_im(num_bins);
-
-    for (int t = 0; t < num_frames; t++) {
-        // Windowed frame (zero-padded if win_length < n_fft)
-        std::fill(frame.begin(), frame.end(), 0.0f);
-        for (int i = 0; i < win_length; i++) {
-            frame[i] = sig[t * hop_length + i] * window[i];
-        }
-
-        fft_real(frame.data(), n_fft, spec_re.data(), spec_im.data());
-
-        // Power spectrum → mel → log
-        for (int m = 0; m < num_mel_bins; m++) {
-            float sum = 0.0f;
-            for (int f = 0; f < num_bins; f++) {
-                float power = spec_re[f] * spec_re[f]
-                            + spec_im[f] * spec_im[f];
-                sum += power * fb[m * num_bins + f];
-            }
-            mel[m * num_frames + t] = std::log(sum + log_floor);
-        }
-    }
-
-    return mel;
-}
diff --git a/sdk/src/main/cpp/audio/mel.h b/sdk/src/main/cpp/audio/mel.h
deleted file mode 100644
index 7350c84..0000000
--- a/sdk/src/main/cpp/audio/mel.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <vector>
-
-/// Compute log-mel spectrogram from raw audio.
-/// Returns flattened [num_mel_bins, num_frames] in channels-first layout
-/// (row = mel bin, column = time frame).
-///
-/// Optional parameters (default to the original behaviour):
-///   slaney_norm  — area-normalise each triangular filter by its bandwidth
-///   log_floor    — additive floor before log: log(x + floor)
-///   center       — pad signal by n_fft/2 on each side (reflect mode)
-std::vector<float> mel_spectrogram(
-    const float* audio, size_t length,
-    int sample_rate, int n_fft, int hop_length,
-    int win_length, int num_mel_bins,
-    bool slaney_norm = false,
-    float log_floor = 1e-10f,
-    bool center = false);
diff --git a/sdk/src/main/cpp/audio/stft.cpp b/sdk/src/main/cpp/audio/stft.cpp
deleted file mode 100644
index 8e89ead..0000000
--- a/sdk/src/main/cpp/audio/stft.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "stft.h"
-#include "fft.h"
-#include <cstring>
-#include <vector>
-#include <algorithm>
-
-int stft_num_frames(size_t signal_length, int fft_size, int hop_size) {
-    if (static_cast<int>(signal_length) < fft_size) return 0;
-    return static_cast<int>((signal_length - fft_size) / hop_size) + 1;
-}
-
-void stft_forward(const float* audio, size_t length,
-                  int fft_size, int hop_size,
-                  const float* window,
-                  float* out_real, float* out_imag)
-{
-    int num_frames = stft_num_frames(length, fft_size, hop_size);
-    int freq_bins = fft_size / 2 + 1;
-    std::vector<float> frame(fft_size);
-
-    for (int t = 0; t < num_frames; t++) {
-        // Apply window
-        for (int i = 0; i < fft_size; i++) {
-            frame[i] = audio[t * hop_size + i] * window[i];
-        }
-
-        fft_real(frame.data(), fft_size,
-                 out_real + t * freq_bins,
-                 out_imag + t * freq_bins);
-    }
-}
-
-void stft_inverse(const float* spec_real, const float* spec_imag,
-                  int num_frames, int fft_size, int hop_size,
-                  const float* window,
-                  float* output, size_t out_length)
-{
-    int freq_bins = fft_size / 2 + 1;
-    std::vector<float> frame(fft_size);
-    std::vector<float> win_sum(out_length, 0.0f);
-
-    std::memset(output, 0, out_length * sizeof(float));
-
-    for (int t = 0; t < num_frames; t++) {
-        ifft_real(spec_real + t * freq_bins,
-                  spec_imag + t * freq_bins,
-                  fft_size, frame.data());
-
-        // Overlap-add with synthesis window
-        for (int i = 0; i < fft_size; i++) {
-            size_t idx = t * hop_size + i;
-            if (idx >= out_length) break;
-            output[idx] += frame[i] * window[i];
-            win_sum[idx] += window[i] * window[i];
-        }
-    }
-
-    // Normalize by window sum
-    for (size_t i = 0; i < out_length; i++) {
-        if (win_sum[i] > 1e-8f) {
-            output[i] /= win_sum[i];
-        }
-    }
-}
diff --git a/sdk/src/main/cpp/audio/stft.h b/sdk/src/main/cpp/audio/stft.h
deleted file mode 100644
index b5f5160..0000000
--- a/sdk/src/main/cpp/audio/stft.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#pragma once
-
-#include <cstddef>
-
-/// Number of STFT frames for a given signal length.
-int stft_num_frames(size_t signal_length, int fft_size, int hop_size);
-
-/// Forward STFT with overlap-add windowing.
-/// @param audio       Input signal
-/// @param length      Number of samples
-/// @param fft_size    FFT size (e.g. 960 for DeepFilterNet3)
-/// @param hop_size    Hop size (e.g. 480)
-/// @param window      Analysis window [fft_size]
-/// @param out_real    Output real spectrum [num_frames * freq_bins]
-/// @param out_imag    Output imaginary spectrum [num_frames * freq_bins]
-void stft_forward(const float* audio, size_t length,
-                  int fft_size, int hop_size,
-                  const float* window,
-                  float* out_real, float* out_imag);
-
-/// Inverse STFT via overlap-add.
-/// @param spec_real   Real spectrum [num_frames * freq_bins]
-/// @param spec_imag   Imaginary spectrum [num_frames * freq_bins]
-/// @param num_frames  Number of STFT frames
-/// @param fft_size    FFT size
-/// @param hop_size    Hop size
-/// @param window      Synthesis window [fft_size]
-/// @param output      Output signal buffer
-/// @param out_length  Expected output length (samples)
-void stft_inverse(const float* spec_real, const float* spec_imag,
-                  int num_frames, int fft_size, int hop_size,
-                  const float* window,
-                  float* output, size_t out_length);
diff --git a/sdk/src/main/cpp/jni_bridge.cpp b/sdk/src/main/cpp/jni_bridge.cpp
index 9ec1ed5..c2bc7cd 100644
--- a/sdk/src/main/cpp/jni_bridge.cpp
+++ b/sdk/src/main/cpp/jni_bridge.cpp
@@ -1,41 +1,42 @@
 #include <jni.h>
 #include <android/log.h>
-#include <string>
-#include <cstdint>
 
-#include <speech_core/speech_core_c.h>
-#include "models/onnx_engine.h"
-#include "models/silero_vad.h"
-#include "models/parakeet_stt.h"
-#include "models/kokoro_tts.h"
-#include "models/deepfilter.h"
+#include <speech_core/models/deepfilter.h>
+#include <speech_core/models/kokoro_tts.h>
+#include <speech_core/models/onnx_engine.h>
+#include <speech_core/models/parakeet_stt.h>
+#include <speech_core/models/silero_vad.h>
+#include <speech_core/pipeline/agent_config.h>
+#include <speech_core/pipeline/voice_pipeline.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
 
 #define LOG_TAG "Speech"
 #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
 #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
 
 // ---------------------------------------------------------------------------
-// Pipeline handle — owns all native objects for one pipeline instance
+// Pipeline handle
+//
+// speech_core::* model wrappers directly implement the speech_core interfaces
+// (VADInterface / STTInterface / TTSInterface / EnhancerInterface), so the
+// JNI bridge constructs them and hands references to VoicePipeline; no
+// C-vtable adapters are needed. `pipeline` is declared last so that it is
+// destroyed first, before the models it was handed references to.
 // ---------------------------------------------------------------------------
 
 struct PipelineHandle {
-    sc_pipeline_t pipeline = nullptr;
-    SileroVad* vad = nullptr;
-    ParakeetStt* stt = nullptr;
-    KokoroTts* tts = nullptr;
-    DeepFilterEnhancer* enhancer = nullptr;
+    std::unique_ptr<speech_core::SileroVad> vad;
+    std::unique_ptr<speech_core::ParakeetStt> stt;
+    std::unique_ptr<speech_core::KokoroTts> tts;
+    std::unique_ptr<speech_core::DeepFilterEnhancer> enhancer;
+    std::unique_ptr<speech_core::VoicePipeline> pipeline;
 
     JavaVM* jvm = nullptr;
     jobject callback = nullptr;
     jmethodID on_event_mid = nullptr;
-
-    ~PipelineHandle() {
-        if (pipeline) sc_pipeline_destroy(pipeline);
-        delete enhancer;
-        delete tts;
-        delete stt;
-        delete vad;
-    }
 };
 
 // ---------------------------------------------------------------------------
@@ -51,152 +52,68 @@ static JNIEnv* get_env(JavaVM* jvm) {
 }
 
 // ---------------------------------------------------------------------------
-// speech-core vtable adapters
+// Pipeline event → Kotlin onEvent
+//
+// Kotlin callback signature, invoked through on_event_mid:
+//   void onEvent(int type, String text, byte[] audio,
+//                float confidence, float sttMs, float ttsMs)
 // ---------------------------------------------------------------------------
 
-// --- VAD ---
-
-static float vad_process_chunk(void* ctx, const float* samples, size_t len) {
-    return static_cast<SileroVad*>(ctx)->process_chunk(samples, len);
-}
-static void vad_reset(void* ctx) {
-    static_cast<SileroVad*>(ctx)->reset();
-}
-static int vad_sample_rate(void* ctx) {
-    return static_cast<SileroVad*>(ctx)->input_sample_rate();
-}
-static size_t vad_chunk_size(void* ctx) {
-    return static_cast<SileroVad*>(ctx)->chunk_size();
-}
-
-// --- STT ---
-
-static sc_transcription_result_t stt_transcribe(
-    void* ctx, const float* audio, size_t len, int sr)
-{
-    auto* stt = static_cast<ParakeetStt*>(ctx);
-    auto r = stt->transcribe(audio, len, sr);
-
-    // Static buffers — valid until next call (per C API contract)
-    static thread_local std::string text_buf;
-    static thread_local std::string lang_buf;
-    text_buf = std::move(r.text);
-    lang_buf = std::move(r.language);
-
-    return {
-        .text = text_buf.c_str(),
-        .language = lang_buf.empty() ? nullptr : lang_buf.c_str(),
-        .confidence = r.confidence,
-        .start_time = 0.0f,
-        .end_time = 0.0f,
-    };
-}
-static int stt_sample_rate(void* ctx) {
-    return static_cast<ParakeetStt*>(ctx)->input_sample_rate();
-}
-
-static void stt_begin_stream(void* ctx, int sample_rate) {
-    static_cast<ParakeetStt*>(ctx)->begin_stream(sample_rate);
-}
-
-static sc_partial_result_t stt_push_chunk(void* ctx, const float* audio, size_t len) {
-    auto* stt = static_cast<ParakeetStt*>(ctx);
-    auto r = stt->push_chunk(audio, len);
-    static thread_local std::string text_buf;
-    static thread_local std::string lang_buf;
-    text_buf = std::move(r.text);
-    lang_buf = std::move(r.language);
-    return {
-        .text = text_buf.c_str(),
-        .language = lang_buf.empty() ? nullptr : lang_buf.c_str(),
-        .confidence = r.confidence,
-    };
-}
-
-static void stt_flush_stream(void* ctx) {
-    static_cast<ParakeetStt*>(ctx)->flush_stream();
-}
-
-static sc_transcription_result_t stt_end_stream(void* ctx) {
-    auto* stt = static_cast<ParakeetStt*>(ctx);
-    auto r = stt->end_stream();
-    static thread_local std::string text_buf;
-    static thread_local std::string lang_buf;
-    text_buf = std::move(r.text);
-    lang_buf = std::move(r.language);
-    return {
-        .text = text_buf.c_str(),
-        .language = lang_buf.empty() ? nullptr : lang_buf.c_str(),
-        .confidence = r.confidence,
-        .start_time = 0.0f,
-        .end_time = 0.0f,
-    };
-}
-
-static void stt_cancel_stream(void* ctx) {
-    static_cast<ParakeetStt*>(ctx)->cancel_stream();
-}
-
-// --- TTS ---
-
-static void tts_synthesize(
-    void* ctx, const char* text, const char* language,
-    sc_tts_chunk_fn on_chunk, void* chunk_ctx)
-{
-    auto* tts = static_cast<KokoroTts*>(ctx);
-    tts->synthesize(text, language, on_chunk, chunk_ctx);
-}
-static int tts_sample_rate(void* ctx) {
-    return static_cast<KokoroTts*>(ctx)->output_sample_rate();
-}
-static void tts_cancel(void* ctx) {
-    static_cast<KokoroTts*>(ctx)->cancel();
-}
-
-// --- Enhancer ---
-
-static void enhancer_enhance(
-    void* ctx, const float* input, size_t len, int sr, float* output)
-{
-    static_cast<DeepFilterEnhancer*>(ctx)->enhance(input, len, sr, output);
-}
-static int enhancer_sample_rate(void* ctx) {
-    return static_cast<DeepFilterEnhancer*>(ctx)->input_sample_rate();
+// Map speech_core::EventType → the int values the Kotlin side expects.
+// Returns -1 for a value that matches none of the cases below.
+// Kotlin's SpeechPipeline.kt switches on raw ints inherited from the original
+// C ABI (sc_event_t.type), whose ordering differs from speech_core::EventType:
+// the C ABI had ResponseAudioDelta=7 / ResponseDone=8, the enum has them
+// swapped. Map explicitly so renumbering speech_core::EventType in the future
+// can't silently break the Kotlin event stream.
+static jint to_kotlin_event(speech_core::EventType t) {
+    using ET = speech_core::EventType;
+    switch (t) {
+        case ET::SessionCreated:         return 0;
+        case ET::SpeechStarted:          return 1;
+        case ET::SpeechEnded:            return 2;
+        case ET::PartialTranscription:   return 3;
+        case ET::TranscriptionCompleted: return 4;
+        case ET::ResponseCreated:        return 5;
+        case ET::ResponseInterrupted:    return 6;
+        case ET::ResponseAudioDelta:     return 7;
+        case ET::ResponseDone:           return 8;
+        case ET::ToolCallStarted:        return 9;
+        case ET::ToolCallCompleted:      return 10;
+        case ET::Error:                  return 11;
+    }
+    return -1;
 }
 
-// ---------------------------------------------------------------------------
-// Event callback → Kotlin
-// ---------------------------------------------------------------------------
-
-static void on_pipeline_event(const sc_event_t* event, void* context) {
-    auto* handle = static_cast<PipelineHandle*>(context);
+static void dispatch_event(PipelineHandle* h,
+                           const speech_core::PipelineEvent& event) {
     LOGI("event type=%d text='%.60s' audio=%zu stt=%.0fms tts=%.0fms",
-         event->type, event->text ? event->text : "",
-         event->audio_data_length, event->stt_duration_ms, event->tts_duration_ms);
-    if (!handle->callback) return;
+         static_cast<int>(event.type), event.text.c_str(),
+         event.audio_data.size(), event.stt_duration_ms,
+         event.tts_duration_ms);
+
+    if (!h->callback) return;
 
-    JNIEnv* env = get_env(handle->jvm);
+    JNIEnv* env = get_env(h->jvm);
     if (!env) return;
 
-    jstring text = event->text
-        ? env->NewStringUTF(event->text) : nullptr;
+    jstring text = !event.text.empty()
+        ? env->NewStringUTF(event.text.c_str()) : nullptr;
 
     jbyteArray audio = nullptr;
-    if (event->audio_data && event->audio_data_length > 0) {
-        audio = env->NewByteArray(static_cast<jsize>(event->audio_data_length));
+    if (!event.audio_data.empty()) {
+        audio = env->NewByteArray(static_cast<jsize>(event.audio_data.size()));
         env->SetByteArrayRegion(audio, 0,
-            static_cast<jsize>(event->audio_data_length),
-            reinterpret_cast<const jbyte*>(event->audio_data));
+            static_cast<jsize>(event.audio_data.size()),
+            reinterpret_cast<const jbyte*>(event.audio_data.data()));
     }
 
-    // void onEvent(int type, String text, byte[] audio,
-    //              float confidence, float sttMs, float ttsMs)
-    env->CallVoidMethod(handle->callback, handle->on_event_mid,
-        static_cast<jint>(event->type),
+    env->CallVoidMethod(h->callback, h->on_event_mid,
+        to_kotlin_event(event.type),
         text, audio,
-        event->confidence,
-        event->stt_duration_ms,
-        event->tts_duration_ms);
+        event.confidence,
+        event.stt_duration_ms,
+        event.tts_duration_ms);
 
     if (audio) env->DeleteLocalRef(audio);
     if (text) env->DeleteLocalRef(text);
@@ -227,7 +144,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeCreate(
     bool nnapi = useNnapi;
     std::string suffix = useInt8 ? "-int8" : "";
 
-    auto* h = new PipelineHandle();
+    auto h = std::make_unique<PipelineHandle>();
     env->GetJavaVM(&h->jvm);
     h->callback = env->NewGlobalRef(callback);
 
@@ -238,62 +155,38 @@ Java_audio_soniqo_speech_NativeBridge_nativeCreate(
 
     try {
         // Load models
-        h->vad = new SileroVad(dir + "/silero-vad.onnx", false);
-        h->stt = new ParakeetStt(
+        h->vad = std::make_unique<speech_core::SileroVad>(
+            dir + "/silero-vad.onnx", /*hw_accel=*/false);
+        h->stt = std::make_unique<speech_core::ParakeetStt>(
             dir + "/parakeet-encoder" + suffix + ".onnx",
             dir + "/parakeet-decoder-joint" + suffix + ".onnx",
             dir + "/vocab.json",
             nnapi);
-        h->tts = new KokoroTts(
+        h->tts = std::make_unique<speech_core::KokoroTts>(
             dir + "/kokoro-e2e.onnx",
             dir + "/voices",
             dir,
             nnapi);
 
-        // Build vtables
-        sc_vad_vtable_t vad_vt = {
-            .context = h->vad,
-            .process_chunk = vad_process_chunk,
-            .reset = vad_reset,
-            .input_sample_rate = vad_sample_rate,
-            .chunk_size = vad_chunk_size,
-        };
-
-        sc_stt_vtable_t stt_vt = {};
-        stt_vt.context = h->stt;
-        stt_vt.transcribe = stt_transcribe;
-        stt_vt.input_sample_rate = stt_sample_rate;
-        stt_vt.begin_stream = stt_begin_stream;
-        stt_vt.push_chunk = stt_push_chunk;
-        stt_vt.flush_stream = stt_flush_stream;
-        stt_vt.end_stream = stt_end_stream;
-        stt_vt.cancel_stream = stt_cancel_stream;
-
-        sc_tts_vtable_t tts_vt = {};
-        tts_vt.context = h->tts;
-        tts_vt.synthesize = tts_synthesize;
-        tts_vt.output_sample_rate = tts_sample_rate;
-        tts_vt.cancel = tts_cancel;
-
-        // Pipeline config
-        sc_config_t config = sc_config_default();
-        config.min_silence_duration = 0.5f;
-        config.eager_stt = false;
-        config.min_speech_duration = 0.15f;
-        config.post_playback_guard = 0.15f;
-        config.emit_partial_transcriptions = emitPartialTranscriptions;
-        config.partial_transcription_interval = partialTranscriptionInterval;
-
-        config.mode = SC_MODE_ECHO;
-        h->pipeline = sc_pipeline_create(
-            stt_vt, tts_vt, nullptr, vad_vt,
-            config, on_pipeline_event, h);
+        speech_core::AgentConfig cfg;
+        cfg.vad.min_silence_duration = 0.5f;
+        cfg.vad.min_speech_duration = 0.15f;
+        cfg.eager_stt = false;
+        cfg.post_playback_guard = 0.15f;
+        cfg.emit_partial_transcriptions = emitPartialTranscriptions;
+        cfg.partial_transcription_interval = partialTranscriptionInterval;
+        cfg.mode = speech_core::AgentConfig::Mode::Echo;
 
         // Note: DeepFilterNet3 noise cancellation is disabled in the pipeline.
-        // DFN operates at 48kHz but the pipeline pushes 16kHz audio — running
-        // DFN without resampling produces artifacts. Needs 16k→48k→DFN→48k→16k
-        // resample chain before it can be re-enabled. See issue #12.
-        // The model is still downloaded for future use.
+        // DFN operates at 48 kHz but the pipeline pushes 16 kHz audio —
+        // running DFN without resampling produces artifacts. Needs a
+        // 16k→48k→DFN→48k→16k resample chain before it can be re-enabled.
+        // See issue #12. The model is still downloaded for future use.
+
+        PipelineHandle* raw = h.get();
+        h->pipeline = std::make_unique<speech_core::VoicePipeline>(
+            *h->stt, *h->tts, /*llm=*/nullptr, *h->vad, cfg,
+            [raw](const speech_core::PipelineEvent& e) { dispatch_event(raw, e); });
 
         auto& engine = OnnxEngine::get();
         if (engine.had_nnapi_fallback()) {
@@ -305,7 +198,6 @@ Java_audio_soniqo_speech_NativeBridge_nativeCreate(
     } catch (const std::exception& e) {
         LOGE("Pipeline creation failed: %s", e.what());
         if (h->callback) env->DeleteGlobalRef(h->callback);
-        delete h;
         jclass ex_cls = env->FindClass("java/lang/RuntimeException");
         if (ex_cls) {
             std::string msg = std::string("Native pipeline failed: ") + e.what();
@@ -314,7 +206,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeCreate(
         return 0;
     }
 
-    return reinterpret_cast<jlong>(h);
+    return reinterpret_cast<jlong>(h.release());
 }
 
 JNIEXPORT jstring JNICALL
@@ -344,7 +236,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeStart(
     JNIEnv* /*env*/, jobject /*thiz*/, jlong handle)
 {
     auto* h = reinterpret_cast<PipelineHandle*>(handle);
-    if (h && h->pipeline) sc_pipeline_start(h->pipeline);
+    if (h && h->pipeline) h->pipeline->start();
 }
 
 JNIEXPORT void JNICALL
@@ -352,7 +244,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeStop(
     JNIEnv* /*env*/, jobject /*thiz*/, jlong handle)
 {
     auto* h = reinterpret_cast<PipelineHandle*>(handle);
-    if (h && h->pipeline) sc_pipeline_stop(h->pipeline);
+    if (h && h->pipeline) h->pipeline->stop();
 }
 
 JNIEXPORT void JNICALL
@@ -364,7 +256,7 @@ Java_audio_soniqo_speech_NativeBridge_nativePushAudio(
     if (!h || !h->pipeline) return;
 
     float* data = env->GetFloatArrayElements(samples, nullptr);
-    sc_pipeline_push_audio(h->pipeline, data, static_cast<size_t>(count));
+    h->pipeline->push_audio(data, static_cast<size_t>(count));
     env->ReleaseFloatArrayElements(samples, data, JNI_ABORT);
 }
 
@@ -373,7 +265,7 @@ Java_audio_soniqo_speech_NativeBridge_nativeResumeListen(
     JNIEnv* /*env*/, jobject /*thiz*/, jlong handle)
 {
     auto* h = reinterpret_cast<PipelineHandle*>(handle);
-    if (h && h->pipeline) sc_pipeline_resume_listening(h->pipeline);
+    if (h && h->pipeline) h->pipeline->resume_listening();
 }
 
 JNIEXPORT jint JNICALL
@@ -381,8 +273,8 @@ Java_audio_soniqo_speech_NativeBridge_nativeGetState(
     JNIEnv* /*env*/, jobject /*thiz*/, jlong handle)
 {
     auto* h = reinterpret_cast<PipelineHandle*>(handle);
-    if (!h || !h->pipeline) return SC_STATE_IDLE;
-    return sc_pipeline_state(h->pipeline);
+    if (!h || !h->pipeline) return 0;
+    return static_cast<jint>(h->pipeline->state());
 }
 
 } // extern "C"
diff --git a/sdk/src/main/cpp/models/deepfilter.cpp b/sdk/src/main/cpp/models/deepfilter.cpp
deleted file mode 100644
index eb43bae..0000000
--- a/sdk/src/main/cpp/models/deepfilter.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-#include "deepfilter.h"
-#include "onnx_engine.h"
-#include "../audio/stft.h"
-#include <cmath>
-#include <cstring>
-#include <fstream>
-
-DeepFilterEnhancer::DeepFilterEnhancer(
-    const std::string& model_path,
-    const std::string& auxiliary_path,
-    bool nnapi)
-{
-    auto& engine = OnnxEngine::get();
-    api_ = engine.api();
-    session_ = engine.load(model_path, nnapi);
-    load_auxiliary(auxiliary_path);
-}
-
-DeepFilterEnhancer::~DeepFilterEnhancer() {
-    if (session_) api_->ReleaseSession(session_);
-}
-
-void DeepFilterEnhancer::load_auxiliary(const std::string& path) {
-    // Load precomputed ERB filterbanks and window from binary file.
-    // Format: erb_fb [481*32] | erb_inv_fb [32*481] | window [960]  (float32)
-    std::ifstream file(path, std::ios::binary);
-    if (!file.is_open()) {
-        LOGE("Auxiliary file not found: %s", path.c_str());
-        return;
-    }
-
-    erb_fb_.resize(cfg_.freq_bins * cfg_.erb_bands);
-    erb_inv_fb_.resize(cfg_.erb_bands * cfg_.freq_bins);
-    window_.resize(cfg_.fft_size);
-
-    file.read(reinterpret_cast<char*>(erb_fb_.data()),
-              erb_fb_.size() * sizeof(float));
-    file.read(reinterpret_cast<char*>(erb_inv_fb_.data()),
-              erb_inv_fb_.size() * sizeof(float));
-    file.read(reinterpret_cast<char*>(window_.data()),
-              window_.size() * sizeof(float));
-}
-
-void DeepFilterEnhancer::compute_erb_features(
-    const float* spec_real, const float* spec_imag, int num_frames,
-    std::vector<float>& feat_erb, std::vector<float>& feat_spec)
-{
-    feat_erb.resize(num_frames * cfg_.erb_bands);
-    feat_spec.resize(num_frames * 2 * cfg_.df_bins);
-
-    for (int t = 0; t < num_frames; t++) {
-        // Power spectrum → ERB bands
-        for (int b = 0; b < cfg_.erb_bands; b++) {
-            float sum = 0.0f;
-            for (int f = 0; f < cfg_.freq_bins; f++) {
-                float re = spec_real[t * cfg_.freq_bins + f];
-                float im = spec_imag[t * cfg_.freq_bins + f];
-                sum += (re * re + im * im) * erb_fb_[f * cfg_.erb_bands + b];
-            }
-            feat_erb[t * cfg_.erb_bands + b] = 10.0f * std::log10(sum + 1e-10f);
-        }
-
-        // Complex spectrum for deep-filtered bins
-        for (int f = 0; f < cfg_.df_bins; f++) {
-            feat_spec[t * 2 * cfg_.df_bins + f] =
-                spec_real[t * cfg_.freq_bins + f];
-            feat_spec[t * 2 * cfg_.df_bins + cfg_.df_bins + f] =
-                spec_imag[t * cfg_.freq_bins + f];
-        }
-    }
-}
-
-void DeepFilterEnhancer::apply_erb_mask(
-    float* spec_real, float* spec_imag,
-    const float* mask, int num_frames)
-{
-    for (int t = 0; t < num_frames; t++) {
-        for (int f = 0; f < cfg_.freq_bins; f++) {
-            // Expand ERB mask to full spectrum
-            float gain = 0.0f;
-            for (int b = 0; b < cfg_.erb_bands; b++) {
-                gain += mask[t * cfg_.erb_bands + b]
-                        * erb_inv_fb_[b * cfg_.freq_bins + f];
-            }
-            spec_real[t * cfg_.freq_bins + f] *= gain;
-            spec_imag[t * cfg_.freq_bins + f] *= gain;
-        }
-    }
-}
-
-void DeepFilterEnhancer::apply_deep_filter(
-    float* spec_real, float* spec_imag,
-    const float* coefs, int num_frames)
-{
-    int pad_before = cfg_.df_order - 1 - cfg_.df_lookahead;
-
-    for (int t = 0; t < num_frames; t++) {
-        for (int f = 0; f < cfg_.df_bins; f++) {
-            float out_re = 0.0f, out_im = 0.0f;
-
-            for (int n = 0; n < cfg_.df_order; n++) {
-                int src_t = t + n - pad_before;
-                if (src_t < 0 || src_t >= num_frames) continue;
-
-                float x_re = spec_real[src_t * cfg_.freq_bins + f];
-                float x_im = spec_imag[src_t * cfg_.freq_bins + f];
-
-                // coefs layout: [1, df_order, T, df_bins, 2]
-                int idx = (n * num_frames * cfg_.df_bins + t * cfg_.df_bins + f) * 2;
-                float w_re = coefs[idx];
-                float w_im = coefs[idx + 1];
-
-                // Complex multiply
-                out_re += x_re * w_re - x_im * w_im;
-                out_im += x_re * w_im + x_im * w_re;
-            }
-
-            spec_real[t * cfg_.freq_bins + f] = out_re;
-            spec_imag[t * cfg_.freq_bins + f] = out_im;
-        }
-    }
-}
-
-void DeepFilterEnhancer::enhance(
-    const float* audio, size_t length, int /*sample_rate*/, float* output)
-{
-    auto* mem = OnnxEngine::get().cpu_memory();
-
-    // --- STFT ---
-
-    int num_frames = stft_num_frames(length, cfg_.fft_size, cfg_.hop_size);
-    std::vector<float> spec_real(num_frames * cfg_.freq_bins);
-    std::vector<float> spec_imag(num_frames * cfg_.freq_bins);
-
-    stft_forward(audio, length, cfg_.fft_size, cfg_.hop_size,
-                 window_.data(), spec_real.data(), spec_imag.data());
-
-    // --- features ---
-
-    std::vector<float> feat_erb, feat_spec;
-    compute_erb_features(spec_real.data(), spec_imag.data(),
-                         num_frames, feat_erb, feat_spec);
-
-    // --- ONNX inference ---
-
-    int64_t T = num_frames;
-    const int64_t erb_shape[]  = {1, 1, T, cfg_.erb_bands};
-    const int64_t spec_shape[] = {1, 2, T, cfg_.df_bins};
-
-    OrtValue* t_erb = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, feat_erb.data(), feat_erb.size() * sizeof(float),
-        erb_shape, 4, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_erb));
-
-    OrtValue* t_spec = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, feat_spec.data(), feat_spec.size() * sizeof(float),
-        spec_shape, 4, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_spec));
-
-    const char* in_names[]  = {"feat_erb", "feat_spec"};
-    const char* out_names[] = {"erb_mask", "df_coefs"};
-    OrtValue* inputs[]  = {t_erb, t_spec};
-    OrtValue* outputs[] = {nullptr, nullptr};
-
-    ort_check(api_, api_->Run(
-        session_, nullptr,
-        in_names, inputs, 2,
-        out_names, 2, outputs));
-
-    float* erb_mask = nullptr;
-    ort_check(api_, api_->GetTensorMutableData(outputs[0], (void**)&erb_mask));
-    float* df_coefs = nullptr;
-    ort_check(api_, api_->GetTensorMutableData(outputs[1], (void**)&df_coefs));
-
-    // --- apply mask + deep filter ---
-
-    apply_erb_mask(spec_real.data(), spec_imag.data(), erb_mask, num_frames);
-    apply_deep_filter(spec_real.data(), spec_imag.data(), df_coefs, num_frames);
-
-    // --- inverse STFT ---
-
-    stft_inverse(spec_real.data(), spec_imag.data(), num_frames,
-                 cfg_.fft_size, cfg_.hop_size,
-                 window_.data(), output, length);
-
-    // --- cleanup ---
-
-    api_->ReleaseValue(outputs[1]);
-    api_->ReleaseValue(outputs[0]);
-    api_->ReleaseValue(t_spec);
-    api_->ReleaseValue(t_erb);
-}
diff --git a/sdk/src/main/cpp/models/deepfilter.h b/sdk/src/main/cpp/models/deepfilter.h
deleted file mode 100644
index c6603e7..0000000
--- a/sdk/src/main/cpp/models/deepfilter.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#pragma once
-
-#include <onnxruntime_c_api.h>
-#include <string>
-#include <vector>
-
-/// DeepFilterNet3 — real-time speech enhancement / noise cancellation.
-/// Processes audio at 48 kHz using STFT + ERB filterbank + neural network.
-/// Model size: ~2.1M parameters (~8 MB FP16).
-class DeepFilterEnhancer {
-public:
-    struct Config {
-        int fft_size    = 960;
-        int hop_size    = 480;
-        int erb_bands   = 32;
-        int df_bins     = 96;   // deep-filtered frequency bins
-        int df_order    = 5;    // filter taps
-        int df_lookahead = 2;
-        int freq_bins   = 481;  // fft_size / 2 + 1
-        int sample_rate = 48000;
-    };
-
-    DeepFilterEnhancer(const std::string& model_path,
-                       const std::string& auxiliary_path,
-                       bool nnapi = true);
-    ~DeepFilterEnhancer();
-
-    /// Enhance audio by removing noise.
-    /// @param audio       Input PCM Float32 at 48 kHz
-    /// @param length      Number of samples
-    /// @param sample_rate Input sample rate (must be 48000)
-    /// @param output      Pre-allocated output buffer (same length)
-    void enhance(const float* audio, size_t length, int sample_rate,
-                 float* output);
-
-    int input_sample_rate() const { return cfg_.sample_rate; }
-
-private:
-    void load_auxiliary(const std::string& path);
-    void compute_erb_features(const float* spectrum_real,
-                              const float* spectrum_imag,
-                              int num_frames,
-                              std::vector<float>& feat_erb,
-                              std::vector<float>& feat_spec);
-    void apply_erb_mask(float* spectrum_real, float* spectrum_imag,
-                        const float* mask, int num_frames);
-    void apply_deep_filter(float* spectrum_real, float* spectrum_imag,
-                           const float* coefs, int num_frames);
-
-    const OrtApi* api_;
-    OrtSession* session_ = nullptr;
-    Config cfg_;
-
-    // ERB filterbanks
-    std::vector<float> erb_fb_;       // [freq_bins, erb_bands]
-    std::vector<float> erb_inv_fb_;   // [erb_bands, freq_bins]
-    std::vector<float> window_;       // Vorbis window [fft_size]
-};
diff --git a/sdk/src/main/cpp/models/inference_engine.h b/sdk/src/main/cpp/models/inference_engine.h
deleted file mode 100644
index c2eb10e..0000000
--- a/sdk/src/main/cpp/models/inference_engine.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-/// Supported inference backends.
-enum class Backend { ONNX, LITERT, AUTO };
-
-/// Tensor element data types.
-enum class DType { FLOAT32, INT64, INT32, INT8 };
-
-/// Describes a tensor's data, shape, and type for passing to inference.
-struct TensorInfo {
-    const void* data;
-    std::vector<int64_t> shape;
-    DType dtype;
-
-    size_t byte_size() const {
-        size_t elems = 1;
-        for (auto d : shape) elems *= static_cast<size_t>(d);
-        switch (dtype) {
-            case DType::FLOAT32: return elems * 4;
-            case DType::INT64:   return elems * 8;
-            case DType::INT32:   return elems * 4;
-            case DType::INT8:    return elems * 1;
-        }
-        return elems * 4;
-    }
-};
-
-/// Wraps a single output tensor from an inference call.
-/// Owns the backend-specific memory — valid until destroyed or next run().
-class OutputTensor {
-public:
-    virtual ~OutputTensor() = default;
-
-    virtual float* data_float() = 0;
-    virtual int64_t* data_int64() = 0;
-    virtual std::vector<int64_t> shape() = 0;
-    virtual size_t element_count() = 0;
-};
-
-/// A loaded model session — run inference with named inputs/outputs.
-class InferenceSession {
-public:
-    virtual ~InferenceSession() = default;
-
-    /// Run inference. Outputs are returned as owned OutputTensor objects.
-    virtual std::vector<std::unique_ptr<OutputTensor>> run(
-        const std::vector<const char*>& input_names,
-        const std::vector<TensorInfo>& inputs,
-        const std::vector<const char*>& output_names) = 0;
-};
-
-/// Factory for loading models. Each backend implements this.
-class InferenceBackend {
-public:
-    virtual ~InferenceBackend() = default;
-
-    virtual std::unique_ptr<InferenceSession> load(
-        const std::string& path, bool hw_accel = true) = 0;
-
-    virtual Backend type() const = 0;
-};
-
-/// Detect the optimal backend for the current device's SoC.
-Backend detect_optimal_backend();
-
-/// Create a backend instance. AUTO resolves via detect_optimal_backend().
-std::unique_ptr<InferenceBackend> create_backend(Backend preference);
diff --git a/sdk/src/main/cpp/models/kokoro_multilingual.cpp b/sdk/src/main/cpp/models/kokoro_multilingual.cpp
deleted file mode 100644
index 9987258..0000000
--- a/sdk/src/main/cpp/models/kokoro_multilingual.cpp
+++ /dev/null
@@ -1,1841 +0,0 @@
-#include "kokoro_multilingual.h"
-#include <algorithm>
-#include <cctype>
-#include <cstdint>
-#include <unordered_map>
-#include <vector>
-
-// ---------------------------------------------------------------------------
-// UTF-8 helpers
-// ---------------------------------------------------------------------------
-
-/// Decode one UTF-8 character, returning codepoint and advancing pos.
-static uint32_t utf8_decode(const std::string& s, size_t& pos) {
-    if (pos >= s.size()) return 0;
-    unsigned char c = static_cast<unsigned char>(s[pos]);
-    uint32_t cp;
-    size_t len;
-    if (c < 0x80)       { cp = c;           len = 1; }
-    else if (c < 0xC0)  { cp = c;           len = 1; } // continuation — error
-    else if (c < 0xE0)  { cp = c & 0x1F;    len = 2; }
-    else if (c < 0xF0)  { cp = c & 0x0F;    len = 3; }
-    else                 { cp = c & 0x07;    len = 4; }
-    for (size_t i = 1; i < len && (pos + i) < s.size(); i++) {
-        cp = (cp << 6) | (static_cast<unsigned char>(s[pos + i]) & 0x3F);
-    }
-    pos += len;
-    return cp;
-}
-
-/// Get one UTF-8 character as a string, advancing pos.
-static std::string utf8_char_at(const std::string& s, size_t& pos) {
-    if (pos >= s.size()) return "";
-    unsigned char c = static_cast<unsigned char>(s[pos]);
-    size_t len = 1;
-    if ((c & 0xE0) == 0xC0)      len = 2;
-    else if ((c & 0xF0) == 0xE0) len = 3;
-    else if ((c & 0xF8) == 0xF0) len = 4;
-    if (pos + len > s.size()) len = s.size() - pos;
-    std::string result = s.substr(pos, len);
-    pos += len;
-    return result;
-}
-
-/// Split UTF-8 string into individual characters.
-static std::vector<std::string> utf8_split(const std::string& s) {
-    std::vector<std::string> out;
-    size_t pos = 0;
-    while (pos < s.size()) {
-        out.push_back(utf8_char_at(s, pos));
-    }
-    return out;
-}
-
-/// Encode a Unicode codepoint to UTF-8.
-static std::string utf8_encode(uint32_t cp) {
-    std::string out;
-    if (cp < 0x80) {
-        out += static_cast<char>(cp);
-    } else if (cp < 0x800) {
-        out += static_cast<char>(0xC0 | (cp >> 6));
-        out += static_cast<char>(0x80 | (cp & 0x3F));
-    } else if (cp < 0x10000) {
-        out += static_cast<char>(0xE0 | (cp >> 12));
-        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
-        out += static_cast<char>(0x80 | (cp & 0x3F));
-    } else {
-        out += static_cast<char>(0xF0 | (cp >> 18));
-        out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
-        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
-        out += static_cast<char>(0x80 | (cp & 0x3F));
-    }
-    return out;
-}
-
-/// Get codepoint of first UTF-8 character.
-static uint32_t utf8_codepoint(const std::string& s) {
-    size_t pos = 0;
-    return utf8_decode(s, pos);
-}
-
-/// Check if a character is a vowel letter (for Latin languages).
-static bool is_latin_vowel(char c) {
-    char lc = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
-    return lc == 'a' || lc == 'e' || lc == 'i' || lc == 'o' || lc == 'u' || lc == 'y';
-}
-
-/// Check if character at offset i in string is a vowel (ASCII only).
-static bool is_vowel_at(const std::string& s, size_t i) {
-    if (i >= s.size()) return false;
-    return is_latin_vowel(s[i]);
-}
-
-/// Check if character is a consonant letter (ASCII).
-static bool is_consonant(char c) {
-    char lc = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
-    return lc >= 'a' && lc <= 'z' && !is_latin_vowel(lc);
-}
-
-/// Lowercase an ASCII string.
-static std::string to_lower_ascii(const std::string& s) {
-    std::string r = s;
-    for (auto& c : r) c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
-    return r;
-}
-
-/// Check if a string starts with prefix at position pos.
-static bool starts_with_at(const std::string& s, size_t pos, const std::string& prefix) {
-    if (pos + prefix.size() > s.size()) return false;
-    return s.compare(pos, prefix.size(), prefix) == 0;
-}
-
-/// Post-process IPA for Kokoro vocab compatibility.
-/// Maps IPA symbols that Kokoro doesn't have to ones it does.
-static std::string kokoro_postprocess(const std::string& ipa) {
-    std::string result = ipa;
-
-    // dʒ → ʤ
-    {
-        std::string from = "d\xCA\x92";   // dʒ
-        std::string to   = "\xCA\xA4";     // ʤ
-        size_t pos = 0;
-        while ((pos = result.find(from, pos)) != std::string::npos) {
-            result.replace(pos, from.size(), to);
-            pos += to.size();
-        }
-    }
-    // tʃ → ʧ
-    {
-        std::string from = "t\xCA\x83";   // tʃ
-        std::string to   = "\xCA\xA7";     // ʧ
-        size_t pos = 0;
-        while ((pos = result.find(from, pos)) != std::string::npos) {
-            result.replace(pos, from.size(), to);
-            pos += to.size();
-        }
-    }
-    // ʁ → ɹ  (French uvular R → Kokoro's approximant R)
-    {
-        std::string from = "\xCA\x81";   // ʁ
-        std::string to   = "\xC9\xB9";   // ɹ
-        size_t pos = 0;
-        while ((pos = result.find(from, pos)) != std::string::npos) {
-            result.replace(pos, from.size(), to);
-            pos += to.size();
-        }
-    }
-    return result;
-}
-
-/// Check if position is at word end (next char is space, punct, or end).
-static bool at_word_end(const std::string& s, size_t pos) {
-    if (pos >= s.size()) return true;
-    char c = s[pos];
-    return c == ' ' || c == ',' || c == '.' || c == '!' || c == '?'
-        || c == ';' || c == ':' || c == '-' || c == '\n' || c == '\t';
-}
-
-// ===========================================================================
-// FRENCH
-// ===========================================================================
-
-std::string multilingual::french_g2p(const std::string& text) {
-    std::string s = to_lower_ascii(text);
-    std::string ipa;
-    size_t len = s.size();
-
-    for (size_t i = 0; i < len; ) {
-        // --- Trigraphs ---
-        if (i + 3 <= len) {
-            std::string tri = s.substr(i, 3);
-            if (tri == "eau") { ipa += "o";    i += 3; continue; }
-            if (tri == "ain") {
-                // ain before consonant or end = nasal
-                if (i + 3 >= len || !is_vowel_at(s, i + 3)) {
-                    ipa += "\xC9\x9B\xCC\x83"; // ɛ̃
-                    i += 3; continue;
-                }
-            }
-            if (tri == "ein") {
-                if (i + 3 >= len || !is_vowel_at(s, i + 3)) {
-                    ipa += "\xC9\x9B\xCC\x83"; // ɛ̃
-                    i += 3; continue;
-                }
-            }
-            if (tri == "oin") {
-                if (i + 3 >= len || !is_vowel_at(s, i + 3)) {
-                    ipa += "w\xC9\x9B\xCC\x83"; // wɛ̃
-                    i += 3; continue;
-                }
-            }
-            if (tri == "ien") {
-                if (i + 3 >= len || !is_vowel_at(s, i + 3)) {
-                    ipa += "j\xC9\x9B\xCC\x83"; // jɛ̃
-                    i += 3; continue;
-                }
-            }
-        }
-
-        // --- Digraphs ---
-        if (i + 2 <= len) {
-            std::string di = s.substr(i, 2);
-
-            // Nasal vowels (before consonant or end, not before vowel)
-            if (di == "on" || di == "om") {
-                if (i + 2 >= len || !is_vowel_at(s, i + 2)) {
-                    // Check not followed by another n/m (e.g., "onne")
-                    if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) {
-                        // Not nasal — "bonne" → not nasal
-                    } else {
-                        ipa += "\xC9\x94\xCC\x83"; // ɔ̃
-                        i += 2; continue;
-                    }
-                }
-            }
-            if (di == "an" || di == "am") {
-                if (i + 2 >= len || !is_vowel_at(s, i + 2)) {
-                    if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) {
-                        // Not nasal
-                    } else {
-                        ipa += "\xC9\x91\xCC\x83"; // ɑ̃
-                        i += 2; continue;
-                    }
-                }
-            }
-            if (di == "en" || di == "em") {
-                if (i + 2 >= len || !is_vowel_at(s, i + 2)) {
-                    if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) {
-                        // Not nasal
-                    } else {
-                        ipa += "\xC9\x91\xCC\x83"; // ɑ̃
-                        i += 2; continue;
-                    }
-                }
-            }
-            if (di == "in" || di == "im") {
-                if (i + 2 >= len || !is_vowel_at(s, i + 2)) {
-                    if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) {
-                        // Not nasal
-                    } else {
-                        ipa += "\xC9\x9B\xCC\x83"; // ɛ̃
-                        i += 2; continue;
-                    }
-                }
-            }
-            if (di == "un" || di == "um") {
-                if (i + 2 >= len || !is_vowel_at(s, i + 2)) {
-                    if (i + 2 < len && (s[i + 2] == 'n' || s[i + 2] == 'm')) {
-                        // Not nasal
-                    } else {
-                        ipa += "\xC5\x93\xCC\x83"; // œ̃
-                        i += 2; continue;
-                    }
-                }
-            }
-
-            // Other digraphs
-            if (di == "ou") { ipa += "u";                 i += 2; continue; }
-            if (di == "oi") { ipa += "wa";                i += 2; continue; }
-            if (di == "ai") { ipa += "\xC9\x9B";         i += 2; continue; } // ɛ
-            if (di == "ei") { ipa += "\xC9\x9B";         i += 2; continue; } // ɛ
-            if (di == "au") { ipa += "o";                 i += 2; continue; }
-            if (di == "eu") { ipa += "\xC3\xB8";         i += 2; continue; } // ø
-            if (di == "ch") { ipa += "\xCA\x83";         i += 2; continue; } // ʃ
-            if (di == "ph") { ipa += "f";                 i += 2; continue; }
-            if (di == "gn") { ipa += "\xC9\xB2";         i += 2; continue; } // ɲ
-            if (di == "qu") { ipa += "k";                 i += 2; continue; }
-            if (di == "gu") {
-                // gu before e/i → g (silent u)
-                if (i + 2 < len && (s[i + 2] == 'e' || s[i + 2] == 'i')) {
-                    ipa += "g"; i += 2; continue;
-                }
-            }
-            if (di == "ss") { ipa += "s";                 i += 2; continue; }
-            if (di == "ll") { ipa += "l";                 i += 2; continue; }
-            if (di == "tt") { ipa += "t";                 i += 2; continue; }
-            if (di == "nn") { ipa += "n";                 i += 2; continue; }
-            if (di == "mm") { ipa += "m";                 i += 2; continue; }
-            if (di == "rr") { ipa += "\xCA\x81";         i += 2; continue; } // ʁ
-        }
-
-        char c = s[i];
-
-        // Context-dependent consonants
-        if (c == 'c') {
-            if (i + 1 < len && (s[i + 1] == 'e' || s[i + 1] == 'i' || s[i + 1] == 'y')) {
-                ipa += "s";
-            } else {
-                ipa += "k";
-            }
-            i++; continue;
-        }
-        if (c == 'g') {
-            if (i + 1 < len && (s[i + 1] == 'e' || s[i + 1] == 'i')) {
-                ipa += "\xCA\x92"; // ʒ
-            } else {
-                ipa += "g";
-            }
-            i++; continue;
-        }
-
-        // Silent final consonants
-        if ((c == 'd' || c == 't' || c == 's' || c == 'x' || c == 'z' || c == 'p')
-            && at_word_end(s, i + 1)) {
-            i++; continue;
-        }
-
-        // Simple consonant mappings
-        if (c == 'j') { ipa += "\xCA\x92"; i++; continue; } // ʒ
-        if (c == 'r') { ipa += "\xCA\x81"; i++; continue; } // ʁ
-        if (c == 'x') { ipa += "ks";       i++; continue; }
-
-        // Vowels
-        if (c == 'e') {
-            // Final 'e' is often silent (schwa)
-            if (at_word_end(s, i + 1) && i > 0) {
-                // Silent final -e (except monosyllables)
-                i++; continue;
-            }
-            ipa += "\xC9\x99"; // ə
-            i++; continue;
-        }
-        if (c == 'u') { ipa += "y";                     i++; continue; }
-        if (c == 'y') { ipa += "i";                     i++; continue; }
-
-        // Passthrough: a, i, o, b, d, f, k, l, m, n, p, t, v, w, z + punctuation
-        if (c == ' ') { ipa += " "; i++; continue; }
-        if (c >= 'a' && c <= 'z') {
-            ipa += c;
-            i++; continue;
-        }
-
-        // Punctuation passthrough
-        if (c == ',' || c == '.' || c == '!' || c == '?' || c == ';' || c == ':' || c == '-') {
-            ipa += c;
-            i++; continue;
-        }
-
-        // Skip unknown characters
-        size_t tmp = i;
-        utf8_char_at(s, tmp);
-        i = tmp;
-    }
-
-    return kokoro_postprocess(ipa);
-}
-
-// ===========================================================================
-// SPANISH
-// ===========================================================================
-
-std::string multilingual::spanish_g2p(const std::string& text) {
-    // Work with UTF-8 characters for accented vowels
-    auto chars = utf8_split(text);
-    std::string ipa;
-
-    for (size_t i = 0; i < chars.size(); ) {
-        std::string c = chars[i];
-        uint32_t cp = utf8_codepoint(c);
-
-        // Lowercase for comparison
-        std::string cl;
-        if (cp >= 'A' && cp <= 'Z') {
-            cl = std::string(1, static_cast<char>(cp + 32));
-        } else {
-            cl = c;
-        }
-
-        // --- Digraphs (check two chars) ---
-        std::string next_l;
-        if (i + 1 < chars.size()) {
-            uint32_t ncp = utf8_codepoint(chars[i + 1]);
-            if (ncp >= 'A' && ncp <= 'Z') {
-                next_l = std::string(1, static_cast<char>(ncp + 32));
-            } else {
-                next_l = chars[i + 1];
-            }
-        }
-
-        if (!next_l.empty()) {
-            std::string di = cl + next_l;
-            if (di == "ch") { ipa += "t\xCA\x83";       i += 2; continue; } // tʃ
-            if (di == "ll") { ipa += "\xCA\x9D";         i += 2; continue; } // ʝ
-            if (di == "rr") { ipa += "r";                 i += 2; continue; }
-            if (di == "qu") {
-                // qu before e/i = k (silent u)
-                if (i + 2 < chars.size()) {
-                    uint32_t nncp = utf8_codepoint(chars[i + 2]);
-                    char nn = static_cast<char>(std::tolower(nncp));
-                    if (nn == 'e' || nn == 'i') {
-                        ipa += "k"; i += 2; continue;
-                    }
-                }
-                ipa += "k"; i += 2; continue;
-            }
-            if (di == "gu") {
-                // gu before e/i = g (silent u)
-                if (i + 2 < chars.size()) {
-                    uint32_t nncp = utf8_codepoint(chars[i + 2]);
-                    char nn = static_cast<char>(std::tolower(nncp));
-                    if (nn == 'e' || nn == 'i') {
-                        ipa += "g"; i += 2; continue;
-                    }
-                }
-            }
-        }
-
-        // --- Accented vowels (stressed, lengthened) ---
-        // á = C3 A1, é = C3 A9, í = C3 AD, ó = C3 B3, ú = C3 BA
-        // Á = C3 81, É = C3 89, Í = C3 8D, Ó = C3 93, Ú = C3 9A
-        if (cp == 0xE1 || cp == 0xC1)  { ipa += "a\xCB\x90"; i++; continue; } // aː
-        if (cp == 0xE9 || cp == 0xC9)  { ipa += "e\xCB\x90"; i++; continue; } // eː
-        if (cp == 0xED || cp == 0xCD)  { ipa += "i\xCB\x90"; i++; continue; } // iː
-        if (cp == 0xF3 || cp == 0xD3)  { ipa += "o\xCB\x90"; i++; continue; } // oː
-        if (cp == 0xFA || cp == 0xDA)  { ipa += "u\xCB\x90"; i++; continue; } // uː
-
-        // ñ = C3 B1, Ñ = C3 91
-        if (cp == 0xF1 || cp == 0xD1)  { ipa += "\xC9\xB2"; i++; continue; } // ɲ
-
-        // ü = C3 BC (used in güe, güi)
-        if (cp == 0xFC || cp == 0xDC)  { ipa += "w"; i++; continue; }
-
-        // --- Context-dependent consonants ---
-        if (cl == "c") {
-            if (!next_l.empty() && (next_l == "e" || next_l == "i")) {
-                ipa += "\xCE\xB8"; // θ (Castilian)
-            } else {
-                ipa += "k";
-            }
-            i++; continue;
-        }
-        if (cl == "g") {
-            if (!next_l.empty() && (next_l == "e" || next_l == "i")) {
-                ipa += "x"; // velar fricative
-            } else {
-                ipa += "g";
-            }
-            i++; continue;
-        }
-        if (cl == "j") { ipa += "x";                    i++; continue; }
-        if (cl == "z") { ipa += "\xCE\xB8";             i++; continue; } // θ
-        if (cl == "v") { ipa += "b";                     i++; continue; } // Spanish v = b
-        if (cl == "h") { i++; continue; } // silent h
-        if (cl == "x") { ipa += "ks";                   i++; continue; }
-
-        // Simple passthrough
-        if (cp == ' ')  { ipa += " "; i++; continue; }
-        if (cp >= 'a' && cp <= 'z') { ipa += static_cast<char>(cp); i++; continue; }
-        if (cp >= 'A' && cp <= 'Z') { ipa += static_cast<char>(cp + 32); i++; continue; }
-
-        // Punctuation
-        if (cp == ',' || cp == '.' || cp == '!' || cp == '?' || cp == ';'
-            || cp == ':' || cp == '-') {
-            ipa += static_cast<char>(cp);
-            i++; continue;
-        }
-        // Inverted punctuation
-        if (cp == 0xBF || cp == 0xA1) { i++; continue; } // ¿ ¡ — skip
-
-        i++; // skip unknown
-    }
-
-    return kokoro_postprocess(ipa);
-}
-
-// ===========================================================================
-// PORTUGUESE
-// ===========================================================================
-
-std::string multilingual::portuguese_g2p(const std::string& text) {
-    auto chars = utf8_split(text);
-    std::string ipa;
-
-    for (size_t i = 0; i < chars.size(); ) {
-        std::string c = chars[i];
-        uint32_t cp = utf8_codepoint(c);
-
-        std::string cl;
-        if (cp >= 'A' && cp <= 'Z') {
-            cl = std::string(1, static_cast<char>(cp + 32));
-        } else {
-            cl = c;
-        }
-
-        // Lookahead
-        std::string next_l, next2_l;
-        if (i + 1 < chars.size()) {
-            uint32_t ncp = utf8_codepoint(chars[i + 1]);
-            next_l = (ncp >= 'A' && ncp <= 'Z')
-                ? std::string(1, static_cast<char>(ncp + 32)) : chars[i + 1];
-        }
-        if (i + 2 < chars.size()) {
-            uint32_t ncp = utf8_codepoint(chars[i + 2]);
-            next2_l = (ncp >= 'A' && ncp <= 'Z')
-                ? std::string(1, static_cast<char>(ncp + 32)) : chars[i + 2];
-        }
-
-        // --- Trigraphs ---
-        if (!next_l.empty() && !next2_l.empty()) {
-            std::string tri = cl + next_l + next2_l;
-            // ção → sɐ̃w̃
-            if (cp == 0xE7 || cp == 0xC7) { // ç
-                if (next_l == "a" || next_l == "\xC3\xA3") { // ã (U+00E3)
-                    uint32_t n2cp = utf8_codepoint(chars[i + 2]);
-                    if (n2cp == 'o' || n2cp == 0xF5) { // o or õ
-                        ipa += "s\xC9\x90\xCC\x83w\xCC\x83"; // sɐ̃w̃
-                        i += 3; continue;
-                    }
-                }
-            }
-            if (tri == "lha" || tri == "lhe" || tri == "lhi" || tri == "lho" || tri == "lhu") {
-                // lh → ʎ
-                ipa += "\xCA\x8E"; // ʎ
-                i += 2; continue; // consume lh, leave vowel for next iteration
-            }
-            if (tri == "nha" || tri == "nhe" || tri == "nhi" || tri == "nho" || tri == "nhu") {
-                ipa += "\xC9\xB2"; // ɲ
-                i += 2; continue;
-            }
-        }
-
-        // --- Digraphs ---
-        if (!next_l.empty()) {
-            std::string di = cl + next_l;
-            if (di == "nh") { ipa += "\xC9\xB2";        i += 2; continue; } // ɲ
-            if (di == "lh") { ipa += "\xCA\x8E";        i += 2; continue; } // ʎ
-            if (di == "ch") { ipa += "\xCA\x83";        i += 2; continue; } // ʃ
-            if (di == "ss") { ipa += "s";                i += 2; continue; }
-            if (di == "rr") { ipa += "\xCA\x81";        i += 2; continue; } // ʁ
-            if (di == "qu") { ipa += "k";                i += 2; continue; }
-            if (di == "gu") {
-                if (!next2_l.empty() && (next2_l == "e" || next2_l == "i")) {
-                    ipa += "g"; i += 2; continue;
-                }
-            }
-            if (di == "ou") { ipa += "ow";               i += 2; continue; }
-            if (di == "ei") { ipa += "ej";               i += 2; continue; }
-            if (di == "ai") { ipa += "aj";               i += 2; continue; }
-            if (di == "oi") { ipa += "oj";               i += 2; continue; }
-        }
-
-        // --- Nasal vowels (ã, õ) ---
-        // ã = U+00E3, õ = U+00F5
-        if (cp == 0xE3 || cp == 0xC3) {
-            // Check for ão
-            if (!next_l.empty()) {
-                uint32_t ncp = utf8_codepoint(chars[i + 1]);
-                if (ncp == 'o' || ncp == 0xF5) {
-                    ipa += "\xC9\x90\xCC\x83w\xCC\x83"; // ɐ̃w̃
-                    i += 2; continue;
-                }
-            }
-            ipa += "\xC9\x90\xCC\x83"; // ɐ̃
-            i++; continue;
-        }
-        if (cp == 0xF5 || cp == 0xD5) {
-            // Check for õe
-            if (!next_l.empty() && (next_l == "e" || next_l == "\xC3\xA9")) {
-                ipa += "o\xCC\x83j\xCC\x83"; // õj̃
-                i += 2; continue;
-            }
-            ipa += "o\xCC\x83"; // õ
-            i++; continue;
-        }
-
-        // --- Accented vowels ---
-        if (cp == 0xE1 || cp == 0xC1) { ipa += "a";  i++; continue; } // á
-        if (cp == 0xE2 || cp == 0xC2) { ipa += "a";  i++; continue; } // â
-        if (cp == 0xE9 || cp == 0xC9) { ipa += "\xC9\x9B"; i++; continue; } // é → ɛ (open)
-        if (cp == 0xEA || cp == 0xCA) { ipa += "e";  i++; continue; } // ê
-        if (cp == 0xED || cp == 0xCD) { ipa += "i";  i++; continue; } // í
-        if (cp == 0xF3 || cp == 0xD3) { ipa += "\xC9\x94"; i++; continue; } // ó → ɔ (open)
-        if (cp == 0xF4 || cp == 0xD4) { ipa += "o";  i++; continue; } // ô
-        if (cp == 0xFA || cp == 0xDA) { ipa += "u";  i++; continue; } // ú
-
-        // ç → s
-        if (cp == 0xE7 || cp == 0xC7) { ipa += "s";  i++; continue; }
-
-        // Context-dependent
-        if (cl == "c") {
-            if (!next_l.empty() && (next_l == "e" || next_l == "i")) {
-                ipa += "s";
-            } else {
-                ipa += "k";
-            }
-            i++; continue;
-        }
-        if (cl == "g") {
-            if (!next_l.empty() && (next_l == "e" || next_l == "i")) {
-                ipa += "\xCA\x92"; // ʒ
-            } else {
-                ipa += "g";
-            }
-            i++; continue;
-        }
-        if (cl == "r") {
-            // Initial r or rr = ʁ, intervocalic = ɾ
-            if (i == 0 || (i > 0 && !is_latin_vowel(chars[i - 1][0]))) {
-                ipa += "\xCA\x81"; // ʁ
-            } else {
-                ipa += "\xC9\xBE"; // ɾ
-            }
-            i++; continue;
-        }
-        if (cl == "s") {
-            // Intervocalic s = z
-            if (i > 0 && i + 1 < chars.size()
-                && is_latin_vowel(chars[i - 1][0]) && is_latin_vowel(chars[i + 1][0])) {
-                ipa += "z";
-            } else {
-                ipa += "s";
-            }
-            i++; continue;
-        }
-
-        if (cl == "j") { ipa += "\xCA\x92"; i++; continue; } // ʒ
-        if (cl == "x") { ipa += "\xCA\x83"; i++; continue; } // ʃ (most common)
-        if (cl == "h") { i++; continue; } // silent
-
-        // Passthrough
-        if (cp == ' ')  { ipa += " "; i++; continue; }
-        if (cp >= 'a' && cp <= 'z') { ipa += static_cast<char>(cp); i++; continue; }
-        if (cp >= 'A' && cp <= 'Z') { ipa += static_cast<char>(cp + 32); i++; continue; }
-
-        // Punctuation
-        if (cp == ',' || cp == '.' || cp == '!' || cp == '?' || cp == ';'
-            || cp == ':' || cp == '-') {
-            ipa += static_cast<char>(cp);
-            i++; continue;
-        }
-
-        i++; // skip unknown
-    }
-
-    return kokoro_postprocess(ipa);
-}
-
-// ===========================================================================
-// ITALIAN
-// ===========================================================================
-
-std::string multilingual::italian_g2p(const std::string& text) {
-    auto chars = utf8_split(text);
-    std::string ipa;
-
-    for (size_t i = 0; i < chars.size(); ) {
-        std::string c = chars[i];
-        uint32_t cp = utf8_codepoint(c);
-
-        std::string cl;
-        if (cp >= 'A' && cp <= 'Z') {
-            cl = std::string(1, static_cast<char>(cp + 32));
-        } else {
-            cl = c;
-        }
-
-        // Lookahead
-        std::string next_l, next2_l;
-        if (i + 1 < chars.size()) {
-            uint32_t ncp = utf8_codepoint(chars[i + 1]);
-            next_l = (ncp >= 'A' && ncp <= 'Z')
-                ? std::string(1, static_cast<char>(ncp + 32)) : chars[i + 1];
-        }
-        if (i + 2 < chars.size()) {
-            uint32_t ncp = utf8_codepoint(chars[i + 2]);
-            next2_l = (ncp >= 'A' && ncp <= 'Z')
-                ? std::string(1, static_cast<char>(ncp + 32)) : chars[i + 2];
-        }
-
-        // --- Trigraphs ---
-        if (!next_l.empty() && !next2_l.empty()) {
-            std::string tri = cl + next_l + next2_l;
-            // sci before e/i = ʃ + vowel
-            if (tri == "sce" || tri == "sci") {
-                ipa += "\xCA\x83"; // ʃ
-                i += 2; continue; // consume sc, leave vowel
-            }
-            // gli before vowel = ʎ + vowel
-            if (cl == "g" && next_l == "l" && next2_l == "i") {
-                // Check if followed by a vowel
-                if (i + 3 < chars.size()) {
-                    uint32_t nncp = utf8_codepoint(chars[i + 3]);
-                    if (nncp == 'a' || nncp == 'e' || nncp == 'i' || nncp == 'o' || nncp == 'u') {
-                        ipa += "\xCA\x8E"; // ʎ
-                        i += 3; continue; // consume gli
-                    }
-                }
-                // gli at end or before consonant = ʎi
-                ipa += "\xCA\x8Ei"; // ʎi
-                i += 3; continue;
-            }
-            // ghi/ghe = g + vowel (hard g before e/i)
-            if (cl == "g" && next_l == "h") {
-                if (next2_l == "e" || next2_l == "i") {
-                    ipa += "g";
-                    i += 2; continue; // consume gh, leave vowel
-                }
-            }
-            // chi/che = k + vowel (hard c before e/i)
-            if (cl == "c" && next_l == "h") {
-                if (next2_l == "e" || next2_l == "i") {
-                    ipa += "k";
-                    i += 2; continue;
-                }
-            }
-        }
-
-        // --- Digraphs ---
-        if (!next_l.empty()) {
-            std::string di = cl + next_l;
-            if (di == "gn") { ipa += "\xC9\xB2";         i += 2; continue; } // ɲ
-            if (di == "sc") {
-                // sc before e/i = ʃ (already handled in trigraphs above for explicit vowel)
-                if (!next2_l.empty() && (next2_l == "e" || next2_l == "i")) {
-                    ipa += "\xCA\x83"; // ʃ
-                    i += 2; continue;
-                }
-                ipa += "sk"; i += 2; continue;
-            }
-            if (di == "qu") { ipa += "kw";                i += 2; continue; }
-            if (di == "ss") { ipa += "s";                 i += 2; continue; }
-            if (di == "zz") { ipa += "ts";                i += 2; continue; }
-            if (di == "cc") {
-                if (!next2_l.empty() && (next2_l == "e" || next2_l == "i")) {
-                    ipa += "t\xCA\x83"; // tʃ
-                    i += 2; continue;
-                }
-                ipa += "kk"; i += 2; continue;
-            }
-            if (di == "gg") {
-                if (!next2_l.empty() && (next2_l == "e" || next2_l == "i")) {
-                    ipa += "d\xCA\x92"; // dʒ
-                    i += 2; continue;
-                }
-                ipa += "gg"; i += 2; continue;
-            }
-            if (di == "gl") {
-                // gl before i = ʎ (covered in trigraphs)
-                // gl otherwise = gl
-                ipa += "gl"; i += 2; continue;
-            }
-        }
-
-        // --- Context-dependent consonants ---
-        if (cl == "c") {
-            if (!next_l.empty() && (next_l == "e" || next_l == "i")) {
-                ipa += "t\xCA\x83"; // tʃ
-            } else {
-                ipa += "k";
-            }
-            i++; continue;
-        }
-        if (cl == "g") {
-            if (!next_l.empty() && (next_l == "e" || next_l == "i")) {
-                ipa += "d\xCA\x92"; // dʒ
-            } else {
-                ipa += "g";
-            }
-            i++; continue;
-        }
-        if (cl == "z") {
-            // Default: ts (can be dz in some words — would need dictionary)
-            ipa += "ts";
-            i++; continue;
-        }
-        if (cl == "s") {
-            // Intervocalic s = z
-            if (i > 0 && i + 1 < chars.size()
-                && is_latin_vowel(chars[i - 1][0]) && is_latin_vowel(chars[i + 1][0])) {
-                ipa += "z";
-            } else {
-                ipa += "s";
-            }
-            i++; continue;
-        }
-
-        // Accented vowels
-        if (cp == 0xE0 || cp == 0xC0) { ipa += "a";  i++; continue; } // à
-        if (cp == 0xE1 || cp == 0xC1) { ipa += "a";  i++; continue; } // á
-        if (cp == 0xE8 || cp == 0xC8) { ipa += "\xC9\x9B"; i++; continue; } // è → ɛ
-        if (cp == 0xE9 || cp == 0xC9) { ipa += "e";  i++; continue; } // é
-        if (cp == 0xEC || cp == 0xCC) { ipa += "i";  i++; continue; } // ì
-        if (cp == 0xED || cp == 0xCD) { ipa += "i";  i++; continue; } // í
-        if (cp == 0xF2 || cp == 0xD2) { ipa += "\xC9\x94"; i++; continue; } // ò → ɔ
-        if (cp == 0xF3 || cp == 0xD3) { ipa += "o";  i++; continue; } // ó
-        if (cp == 0xF9 || cp == 0xD9) { ipa += "u";  i++; continue; } // ù
-        if (cp == 0xFA || cp == 0xDA) { ipa += "u";  i++; continue; } // ú
-
-        if (cl == "h") { i++; continue; } // silent
-        if (cl == "j") { ipa += "j"; i++; continue; }
-
-        // Passthrough
-        if (cp == ' ')  { ipa += " "; i++; continue; }
-        if (cp >= 'a' && cp <= 'z') { ipa += static_cast<char>(cp); i++; continue; }
-        if (cp >= 'A' && cp <= 'Z') { ipa += static_cast<char>(cp + 32); i++; continue; }
-
-        // Punctuation
-        if (cp == ',' || cp == '.' || cp == '!' || cp == '?' || cp == ';'
-            || cp == ':' || cp == '-') {
-            ipa += static_cast<char>(cp);
-            i++; continue;
-        }
-
-        i++; // skip unknown
-    }
-
-    return kokoro_postprocess(ipa);
-}
-
-// ===========================================================================
-// JAPANESE
-// ===========================================================================
-
-// Katakana and Hiragana → IPA tables.
-// Built as static maps initialized on first use.
-
-struct KanaEntry {
-    const char* kana;
-    const char* ipa;
-};
-
-static const std::unordered_map<std::string, std::string>& get_kana_map() {
-    static const std::unordered_map<std::string, std::string> map = []() {
-        std::unordered_map<std::string, std::string> m;
-
-        // --- Katakana digraphs (must be checked before singles) ---
-        // Stored with their UTF-8 sequences.
-
-        // キャ行
-        m["\xe3\x82\xad\xe3\x83\xa3"] = "kja"; // キャ
-        m["\xe3\x82\xad\xe3\x83\xa5"] = "kju"; // キュ
-        m["\xe3\x82\xad\xe3\x83\xa7"] = "kjo"; // キョ
-
-        // シャ行
-        m["\xe3\x82\xb7\xe3\x83\xa3"] = "\xca\x83" "a"; // シャ = ʃa
-        m["\xe3\x82\xb7\xe3\x83\xa5"] = "\xca\x83" "u"; // シュ = ʃu
-        m["\xe3\x82\xb7\xe3\x83\xa7"] = "\xca\x83" "o"; // ショ = ʃo
-
-        // チャ行
-        m["\xe3\x83\x81\xe3\x83\xa3"] = "t\xca\x83" "a"; // チャ = tʃa
-        m["\xe3\x83\x81\xe3\x83\xa5"] = "t\xca\x83" "u"; // チュ = tʃu
-        m["\xe3\x83\x81\xe3\x83\xa7"] = "t\xca\x83" "o"; // チョ = tʃo
-
-        // ニャ行
-        m["\xe3\x83\x8b\xe3\x83\xa3"] = "\xc9\xb2" "a"; // ニャ = ɲa
-        m["\xe3\x83\x8b\xe3\x83\xa5"] = "\xc9\xb2" "u"; // ニュ = ɲu
-        m["\xe3\x83\x8b\xe3\x83\xa7"] = "\xc9\xb2" "o"; // ニョ = ɲo
-
-        // ヒャ行
-        m["\xe3\x83\x92\xe3\x83\xa3"] = "\xc3\xa7" "a"; // ヒャ = ça
-        m["\xe3\x83\x92\xe3\x83\xa5"] = "\xc3\xa7" "u"; // ヒュ = çu
-        m["\xe3\x83\x92\xe3\x83\xa7"] = "\xc3\xa7" "o"; // ヒョ = ço
-
-        // ミャ行
-        m["\xe3\x83\x9f\xe3\x83\xa3"] = "mja"; // ミャ
-        m["\xe3\x83\x9f\xe3\x83\xa5"] = "mju"; // ミュ
-        m["\xe3\x83\x9f\xe3\x83\xa7"] = "mjo"; // ミョ
-
-        // リャ行
-        m["\xe3\x83\xaa\xe3\x83\xa3"] = "\xc9\xbe" "ja"; // リャ = ɾja
-        m["\xe3\x83\xaa\xe3\x83\xa5"] = "\xc9\xbe" "ju"; // リュ = ɾju
-        m["\xe3\x83\xaa\xe3\x83\xa7"] = "\xc9\xbe" "jo"; // リョ = ɾjo
-
-        // ギャ行
-        m["\xe3\x82\xae\xe3\x83\xa3"] = "gja"; // ギャ
-        m["\xe3\x82\xae\xe3\x83\xa5"] = "gju"; // ギュ
-        m["\xe3\x82\xae\xe3\x83\xa7"] = "gjo"; // ギョ
-
-        // ジャ行
-        m["\xe3\x82\xb8\xe3\x83\xa3"] = "d\xca\x92" "a"; // ジャ = dʒa
-        m["\xe3\x82\xb8\xe3\x83\xa5"] = "d\xca\x92" "u"; // ジュ = dʒu
-        m["\xe3\x82\xb8\xe3\x83\xa7"] = "d\xca\x92" "o"; // ジョ = dʒo
-
-        // ビャ行
-        m["\xe3\x83\x93\xe3\x83\xa3"] = "bja"; // ビャ
-        m["\xe3\x83\x93\xe3\x83\xa5"] = "bju"; // ビュ
-        m["\xe3\x83\x93\xe3\x83\xa7"] = "bjo"; // ビョ
-
-        // ピャ行
-        m["\xe3\x83\x94\xe3\x83\xa3"] = "pja"; // ピャ
-        m["\xe3\x83\x94\xe3\x83\xa5"] = "pju"; // ピュ
-        m["\xe3\x83\x94\xe3\x83\xa7"] = "pjo"; // ピョ
-
-        // --- Katakana singles ---
-        // ア行
-        m["\xe3\x82\xa2"] = "a";   // ア
-        m["\xe3\x82\xa4"] = "i";   // イ
-        m["\xe3\x82\xa6"] = "\xc9\xb0"; // ウ = ɰ (unrounded)
-        m["\xe3\x82\xa8"] = "e";   // エ
-        m["\xe3\x82\xaa"] = "o";   // オ
-
-        // カ行
-        m["\xe3\x82\xab"] = "ka";  // カ
-        m["\xe3\x82\xad"] = "ki";  // キ
-        m["\xe3\x82\xaf"] = "k\xc9\xb0"; // ク = kɰ
-        m["\xe3\x82\xb1"] = "ke";  // ケ
-        m["\xe3\x82\xb3"] = "ko";  // コ
-
-        // サ行
-        m["\xe3\x82\xb5"] = "sa";  // サ
-        m["\xe3\x82\xb7"] = "\xca\x83i"; // シ = ʃi
-        m["\xe3\x82\xb9"] = "s\xc9\xb0"; // ス = sɰ
-        m["\xe3\x82\xbb"] = "se";  // セ
-        m["\xe3\x82\xbd"] = "so";  // ソ
-
-        // タ行
-        m["\xe3\x82\xbf"] = "ta";  // タ
-        m["\xe3\x83\x81"] = "t\xca\x83i"; // チ = tʃi
-        m["\xe3\x83\x84"] = "ts\xc9\xb0"; // ツ = tsɰ
-        m["\xe3\x83\x86"] = "te";  // テ
-        m["\xe3\x83\x88"] = "to";  // ト
-
-        // ナ行
-        m["\xe3\x83\x8a"] = "na";  // ナ
-        m["\xe3\x83\x8b"] = "\xc9\xb2i"; // ニ = ɲi
-        m["\xe3\x83\x8c"] = "n\xc9\xb0"; // ヌ = nɰ
-        m["\xe3\x83\x8d"] = "ne";  // ネ
-        m["\xe3\x83\x8e"] = "no";  // ノ
-
-        // ハ行
-        m["\xe3\x83\x8f"] = "ha";  // ハ
-        m["\xe3\x83\x92"] = "\xc3\xa7i"; // ヒ = çi
-        m["\xe3\x83\x95"] = "\xc9\xb8\xc9\xb0"; // フ = ɸɰ
-        m["\xe3\x83\x98"] = "he";  // ヘ
-        m["\xe3\x83\x9b"] = "ho";  // ホ
-
-        // マ行
-        m["\xe3\x83\x9e"] = "ma";  // マ
-        m["\xe3\x83\x9f"] = "mi";  // ミ
-        m["\xe3\x83\xa0"] = "m\xc9\xb0"; // ム = mɰ
-        m["\xe3\x83\xa1"] = "me";  // メ
-        m["\xe3\x83\xa2"] = "mo";  // モ
-
-        // ヤ行
-        m["\xe3\x83\xa4"] = "ja";  // ヤ
-        m["\xe3\x83\xa6"] = "j\xc9\xb0"; // ユ = jɰ
-        m["\xe3\x83\xa8"] = "jo";  // ヨ
-
-        // ラ行
-        m["\xe3\x83\xa9"] = "\xc9\xbe" "a"; // ラ = ɾa
-        m["\xe3\x83\xaa"] = "\xc9\xbe" "i"; // リ = ɾi
-        m["\xe3\x83\xab"] = "\xc9\xbe\xc9\xb0"; // ル = ɾɰ
-        m["\xe3\x83\xac"] = "\xc9\xbe" "e"; // レ = ɾe
-        m["\xe3\x83\xad"] = "\xc9\xbe" "o"; // ロ = ɾo
-
-        // ワ行
-        m["\xe3\x83\xaf"] = "wa";  // ワ
-        m["\xe3\x83\xb2"] = "o";   // ヲ
-        m["\xe3\x83\xb3"] = "\xc9\xb4"; // ン = ɴ
-
-        // 濁音 (voiced) — ガ行
-        m["\xe3\x82\xac"] = "ga";  // ガ
-        m["\xe3\x82\xae"] = "gi";  // ギ
-        m["\xe3\x82\xb0"] = "g\xc9\xb0"; // グ = gɰ
-        m["\xe3\x82\xb2"] = "ge";  // ゲ
-        m["\xe3\x82\xb4"] = "go";  // ゴ
-
-        // ザ行
-        m["\xe3\x82\xb6"] = "za";  // ザ
-        m["\xe3\x82\xb8"] = "d\xca\x92i"; // ジ = dʒi
-        m["\xe3\x82\xba"] = "z\xc9\xb0"; // ズ = zɰ
-        m["\xe3\x82\xbc"] = "ze";  // ゼ
-        m["\xe3\x82\xbe"] = "zo";  // ゾ
-
-        // ダ行
-        m["\xe3\x83\x80"] = "da";  // ダ
-        m["\xe3\x83\x82"] = "d\xca\x92i"; // ヂ = dʒi
-        m["\xe3\x83\x85"] = "z\xc9\xb0"; // ヅ = zɰ
-        m["\xe3\x83\x87"] = "de";  // デ
-        m["\xe3\x83\x89"] = "do";  // ド
-
-        // バ行
-        m["\xe3\x83\x90"] = "ba";  // バ
-        m["\xe3\x83\x93"] = "bi";  // ビ
-        m["\xe3\x83\x96"] = "b\xc9\xb0"; // ブ = bɰ
-        m["\xe3\x83\x99"] = "be";  // ベ
-        m["\xe3\x83\x9c"] = "bo";  // ボ
-
-        // パ行
-        m["\xe3\x83\x91"] = "pa";  // パ
-        m["\xe3\x83\x94"] = "pi";  // ピ
-        m["\xe3\x83\x97"] = "p\xc9\xb0"; // プ = pɰ
-        m["\xe3\x83\x9a"] = "pe";  // ペ
-        m["\xe3\x83\x9d"] = "po";  // ポ
-
-        // Special
-        m["\xe3\x83\x83"] = "\xca\x94"; // ッ (small tsu) = ʔ (glottal stop)
-        m["\xe3\x83\xbc"] = "\xcb\x90"; // ー (long vowel) = ː
-
-        // --- Hiragana (offset katakana by 0x60) ---
-        // We add the same entries for hiragana.
-        // Hiragana range: U+3041-U+3093
-        // Katakana range: U+30A1-U+30F3
-        // Offset: Hiragana = Katakana - 0x60
-
-        // Hiragana vowels
-        m["\xe3\x81\x82"] = "a";   // あ
-        m["\xe3\x81\x84"] = "i";   // い
-        m["\xe3\x81\x86"] = "\xc9\xb0"; // う = ɰ
-        m["\xe3\x81\x88"] = "e";   // え
-        m["\xe3\x81\x8a"] = "o";   // お
-
-        // か行
-        m["\xe3\x81\x8b"] = "ka";  // か
-        m["\xe3\x81\x8d"] = "ki";  // き
-        m["\xe3\x81\x8f"] = "k\xc9\xb0"; // く
-        m["\xe3\x81\x91"] = "ke";  // け
-        m["\xe3\x81\x93"] = "ko";  // こ
-
-        // さ行
-        m["\xe3\x81\x95"] = "sa";  // さ
-        m["\xe3\x81\x97"] = "\xca\x83i"; // し = ʃi
-        m["\xe3\x81\x99"] = "s\xc9\xb0"; // す
-        m["\xe3\x81\x9b"] = "se";  // せ
-        m["\xe3\x81\x9d"] = "so";  // そ
-
-        // た行
-        m["\xe3\x81\x9f"] = "ta";  // た
-        m["\xe3\x81\xa1"] = "t\xca\x83i"; // ち = tʃi
-        m["\xe3\x81\xa4"] = "ts\xc9\xb0"; // つ
-        m["\xe3\x81\xa6"] = "te";  // て
-        m["\xe3\x81\xa8"] = "to";  // と
-
-        // な行
-        m["\xe3\x81\xaa"] = "na";  // な
-        m["\xe3\x81\xab"] = "\xc9\xb2i"; // に = ɲi
-        m["\xe3\x81\xac"] = "n\xc9\xb0"; // ぬ
-        m["\xe3\x81\xad"] = "ne";  // ね
-        m["\xe3\x81\xae"] = "no";  // の
-
-        // は行
-        m["\xe3\x81\xaf"] = "ha";  // は
-        m["\xe3\x81\xb2"] = "\xc3\xa7i"; // ひ = çi
-        m["\xe3\x81\xb5"] = "\xc9\xb8\xc9\xb0"; // ふ = ɸɰ
-        m["\xe3\x81\xb8"] = "he";  // へ
-        m["\xe3\x81\xbb"] = "ho";  // ほ
-
-        // ま行
-        m["\xe3\x81\xbe"] = "ma";  // ま
-        m["\xe3\x81\xbf"] = "mi";  // み
-        m["\xe3\x82\x80"] = "m\xc9\xb0"; // む
-        m["\xe3\x82\x81"] = "me";  // め
-        m["\xe3\x82\x82"] = "mo";  // も
-
-        // や行
-        m["\xe3\x82\x84"] = "ja";  // や
-        m["\xe3\x82\x86"] = "j\xc9\xb0"; // ゆ
-        m["\xe3\x82\x88"] = "jo";  // よ
-
-        // ら行
-        m["\xe3\x82\x89"] = "\xc9\xbe" "a"; // ら = ɾa
-        m["\xe3\x82\x8a"] = "\xc9\xbe" "i"; // り = ɾi
-        m["\xe3\x82\x8b"] = "\xc9\xbe\xc9\xb0"; // る = ɾɰ
-        m["\xe3\x82\x8c"] = "\xc9\xbe" "e"; // れ = ɾe
-        m["\xe3\x82\x8d"] = "\xc9\xbe" "o"; // ろ = ɾo
-
-        // わ行
-        m["\xe3\x82\x8f"] = "wa";  // わ
-        m["\xe3\x82\x92"] = "o";   // を
-        m["\xe3\x82\x93"] = "\xc9\xb4"; // ん = ɴ
-
-        // 濁音 — が行
-        m["\xe3\x81\x8c"] = "ga";  // が
-        m["\xe3\x81\x8e"] = "gi";  // ぎ
-        m["\xe3\x81\x90"] = "g\xc9\xb0"; // ぐ
-        m["\xe3\x81\x92"] = "ge";  // げ
-        m["\xe3\x81\x94"] = "go";  // ご
-
-        // ざ行
-        m["\xe3\x81\x96"] = "za";  // ざ
-        m["\xe3\x81\x98"] = "d\xca\x92i"; // じ = dʒi
-        m["\xe3\x81\x9a"] = "z\xc9\xb0"; // ず
-        m["\xe3\x81\x9c"] = "ze";  // ぜ
-        m["\xe3\x81\x9e"] = "zo";  // ぞ
-
-        // だ行
-        m["\xe3\x81\xa0"] = "da";  // だ
-        m["\xe3\x81\xa2"] = "d\xca\x92i"; // ぢ = dʒi
-        m["\xe3\x81\xa5"] = "z\xc9\xb0"; // づ
-        m["\xe3\x81\xa7"] = "de";  // で
-        m["\xe3\x81\xa9"] = "do";  // ど
-
-        // ば行
-        m["\xe3\x81\xb0"] = "ba";  // ば
-        m["\xe3\x81\xb3"] = "bi";  // び
-        m["\xe3\x81\xb6"] = "b\xc9\xb0"; // ぶ
-        m["\xe3\x81\xb9"] = "be";  // べ
-        m["\xe3\x81\xbc"] = "bo";  // ぼ
-
-        // ぱ行
-        m["\xe3\x81\xb1"] = "pa";  // ぱ
-        m["\xe3\x81\xb4"] = "pi";  // ぴ
-        m["\xe3\x81\xb7"] = "p\xc9\xb0"; // ぷ
-        m["\xe3\x81\xba"] = "pe";  // ぺ
-        m["\xe3\x81\xbd"] = "po";  // ぽ
-
-        // Special hiragana
-        m["\xe3\x81\xa3"] = "\xca\x94"; // っ (small tsu) = ʔ
-
-        // Hiragana digraphs (きゃ etc.)
-        m["\xe3\x81\x8d\xe3\x82\x83"] = "kja"; // きゃ
-        m["\xe3\x81\x8d\xe3\x82\x85"] = "kju"; // きゅ
-        m["\xe3\x81\x8d\xe3\x82\x87"] = "kjo"; // きょ
-
-        m["\xe3\x81\x97\xe3\x82\x83"] = "\xca\x83" "a"; // しゃ = ʃa
-        m["\xe3\x81\x97\xe3\x82\x85"] = "\xca\x83" "u"; // しゅ = ʃu
-        m["\xe3\x81\x97\xe3\x82\x87"] = "\xca\x83" "o"; // しょ = ʃo
-
-        m["\xe3\x81\xa1\xe3\x82\x83"] = "t\xca\x83" "a"; // ちゃ = tʃa
-        m["\xe3\x81\xa1\xe3\x82\x85"] = "t\xca\x83" "u"; // ちゅ = tʃu
-        m["\xe3\x81\xa1\xe3\x82\x87"] = "t\xca\x83" "o"; // ちょ = tʃo
-
-        m["\xe3\x81\xab\xe3\x82\x83"] = "\xc9\xb2" "a"; // にゃ = ɲa
-        m["\xe3\x81\xab\xe3\x82\x85"] = "\xc9\xb2" "u"; // にゅ = ɲu
-        m["\xe3\x81\xab\xe3\x82\x87"] = "\xc9\xb2" "o"; // にょ = ɲo
-
-        m["\xe3\x81\xb2\xe3\x82\x83"] = "\xc3\xa7" "a"; // ひゃ = ça
-        m["\xe3\x81\xb2\xe3\x82\x85"] = "\xc3\xa7" "u"; // ひゅ = çu
-        m["\xe3\x81\xb2\xe3\x82\x87"] = "\xc3\xa7" "o"; // ひょ = ço
-
-        m["\xe3\x81\xbf\xe3\x82\x83"] = "mja"; // みゃ
-        m["\xe3\x81\xbf\xe3\x82\x85"] = "mju"; // みゅ
-        m["\xe3\x81\xbf\xe3\x82\x87"] = "mjo"; // みょ
-
-        m["\xe3\x82\x8a\xe3\x82\x83"] = "\xc9\xbe" "ja"; // りゃ = ɾja
-        m["\xe3\x82\x8a\xe3\x82\x85"] = "\xc9\xbe" "ju"; // りゅ = ɾju
-        m["\xe3\x82\x8a\xe3\x82\x87"] = "\xc9\xbe" "jo"; // りょ = ɾjo
-
-        m["\xe3\x81\x8e\xe3\x82\x83"] = "gja"; // ぎゃ
-        m["\xe3\x81\x8e\xe3\x82\x85"] = "gju"; // ぎゅ
-        m["\xe3\x81\x8e\xe3\x82\x87"] = "gjo"; // ぎょ
-
-        m["\xe3\x81\x98\xe3\x82\x83"] = "d\xca\x92" "a"; // じゃ = dʒa
-        m["\xe3\x81\x98\xe3\x82\x85"] = "d\xca\x92" "u"; // じゅ = dʒu
-        m["\xe3\x81\x98\xe3\x82\x87"] = "d\xca\x92" "o"; // じょ = dʒo
-
-        m["\xe3\x81\xb3\xe3\x82\x83"] = "bja"; // びゃ
-        m["\xe3\x81\xb3\xe3\x82\x85"] = "bju"; // びゅ
-        m["\xe3\x81\xb3\xe3\x82\x87"] = "bjo"; // びょ
-
-        m["\xe3\x81\xb4\xe3\x82\x83"] = "pja"; // ぴゃ
-        m["\xe3\x81\xb4\xe3\x82\x85"] = "pju"; // ぴゅ
-        m["\xe3\x81\xb4\xe3\x82\x87"] = "pjo"; // ぴょ
-
-        return m;
-    }();
-    return map;
-}
-
-std::string multilingual::japanese_g2p(const std::string& text) {
-    const auto& kana_map = get_kana_map();
-    auto chars = utf8_split(text);
-    std::string ipa;
-
-    for (size_t i = 0; i < chars.size(); ) {
-        // Try digraph (two characters) first
-        if (i + 1 < chars.size()) {
-            std::string pair = chars[i] + chars[i + 1];
-            auto it = kana_map.find(pair);
-            if (it != kana_map.end()) {
-                ipa += it->second;
-                i += 2;
-                continue;
-            }
-        }
-
-        // Try single character
-        auto it = kana_map.find(chars[i]);
-        if (it != kana_map.end()) {
-            ipa += it->second;
-            i++;
-            continue;
-        }
-
-        uint32_t cp = utf8_codepoint(chars[i]);
-
-        // ASCII passthrough
-        if (cp == ' ')  { ipa += " "; i++; continue; }
-        if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z')
-            || (cp >= '0' && cp <= '9')) {
-            ipa += chars[i]; i++; continue;
-        }
-        // Punctuation
-        if (cp == ',' || cp == '.' || cp == '!' || cp == '?'
-            || cp == 0x3001 || cp == 0x3002) { // 、。
-            ipa += ","; // normalize Japanese punctuation to comma pause
-            i++; continue;
-        }
-
-        // CJK ideographs (kanji) — pass through as-is (requires JNI/dictionary for proper conversion)
-        if (cp >= 0x4E00 && cp <= 0x9FFF) {
-            // TODO: Kanji→reading conversion requires dictionary or JNI callback
-            ipa += chars[i];
-            i++; continue;
-        }
-
-        i++; // skip unknown
-    }
-
-    return kokoro_postprocess(ipa);
-}
-
-// ===========================================================================
-// CHINESE (Pinyin → IPA)
-// ===========================================================================
-
-// Pinyin syllable → IPA conversion.
-// This handles pre-segmented pinyin input (space-separated syllables).
-// For raw Chinese text, a pinyin segmenter is needed upstream (JNI or ICU).
-
-struct PinyinMapping {
-    const char* pinyin;
-    const char* ipa;
-};
-
-// Build the pinyin→IPA table on first use.
-static const std::unordered_map<std::string, std::string>& get_pinyin_finals_map() {
-    static const std::unordered_map<std::string, std::string> map = {
-        // Complex finals first (longer match priority)
-        {"iang", "ja\xc5\x8b"},     // jaŋ
-        {"iong", "j\xca\x8a\xc5\x8b"}, // jʊŋ
-        {"uang", "wa\xc5\x8b"},     // waŋ
-        {"iao",  "jaw"},
-        {"ian",  "j\xc9\x9bn"},     // jɛn
-        {"ang",  "a\xc5\x8b"},      // aŋ
-        {"eng",  "\xc9\x99\xc5\x8b"}, // əŋ
-        {"ing",  "i\xc5\x8b"},      // iŋ
-        {"ong",  "\xca\x8a\xc5\x8b"}, // ʊŋ
-        {"uai",  "waj"},
-        {"uan",  "wan"},
-        {"ai",   "aj"},
-        {"ei",   "ej"},
-        {"ao",   "aw"},
-        {"ou",   "ow"},
-        {"an",   "an"},
-        {"en",   "\xc9\x99n"},       // ən
-        {"in",   "in"},
-        {"un",   "\xc9\x99n"},       // ən (=uen simplified)
-        {"ia",   "ja"},
-        {"ie",   "je"},
-        {"uo",   "wo"},
-        {"ua",   "wa"},
-        {"ue",   "we"},              // üe
-        {"ui",   "wej"},             // =uei
-        {"iu",   "jow"},             // =iou
-        {"er",   "\xc9\x99\xc9\xbb"}, // əɻ
-        {"a",    "a"},
-        {"e",    "\xc9\xa4"},        // ɤ
-        {"i",    "i"},
-        {"o",    "wo"},
-        {"u",    "u"},
-    };
-    return map;
-}
-
-static const std::unordered_map<std::string, std::string>& get_pinyin_initials_map() {
-    static const std::unordered_map<std::string, std::string> map = {
-        {"zh",  "\xca\x88\xca\x82"},     // ʈʂ
-        {"ch",  "\xca\x88\xca\x82\xca\xb0"}, // ʈʂʰ
-        {"sh",  "\xca\x82"},              // ʂ
-        {"b",   "p"},
-        {"p",   "p\xca\xb0"},            // pʰ
-        {"m",   "m"},
-        {"f",   "f"},
-        {"d",   "t"},
-        {"t",   "t\xca\xb0"},            // tʰ
-        {"n",   "n"},
-        {"l",   "l"},
-        {"g",   "k"},
-        {"k",   "k\xca\xb0"},            // kʰ
-        {"h",   "x"},
-        {"j",   "t\xc9\x95"},            // tɕ
-        {"q",   "t\xc9\x95\xca\xb0"},    // tɕʰ
-        {"x",   "\xc9\x95"},             // ɕ
-        {"z",   "ts"},
-        {"c",   "ts\xca\xb0"},           // tsʰ
-        {"s",   "s"},
-        {"r",   "\xc9\xbb"},             // ɻ
-        {"y",   "j"},                     // glide
-        {"w",   "w"},                     // glide
-    };
-    return map;
-}
-
-/// Convert a single pinyin syllable (with optional tone number) to IPA.
-static std::string pinyin_syllable_to_ipa(const std::string& syllable) {
-    if (syllable.empty()) return "";
-
-    std::string syl = to_lower_ascii(syllable);
-
-    // Strip tone number (1-5) at end
-    if (!syl.empty() && syl.back() >= '1' && syl.back() <= '5') {
-        syl.pop_back();
-    }
-    if (syl.empty()) return "";
-
-    // Handle ü (written as v or ü in some pinyin systems)
-    {
-        size_t pos = 0;
-        while ((pos = syl.find('v', pos)) != std::string::npos) {
-            syl.replace(pos, 1, "\xc3\xbc"); // ü
-            pos += 2;
-        }
-    }
-
-    const auto& initials = get_pinyin_initials_map();
-    const auto& finals = get_pinyin_finals_map();
-
-    std::string initial_ipa;
-    std::string remaining = syl;
-
-    // Try two-char initial first, then one-char
-    if (syl.size() >= 2) {
-        auto it = initials.find(syl.substr(0, 2));
-        if (it != initials.end()) {
-            initial_ipa = it->second;
-            remaining = syl.substr(2);
-        }
-    }
-    if (initial_ipa.empty() && syl.size() >= 1) {
-        auto it = initials.find(syl.substr(0, 1));
-        if (it != initials.end()) {
-            initial_ipa = it->second;
-            remaining = syl.substr(1);
-        }
-    }
-
-    // Special case: ü finals after j/q/x/y (written as u but pronounced y)
-    if (!remaining.empty() && remaining[0] == 'u') {
-        std::string init1 = syl.size() >= 1 ? syl.substr(0, 1) : "";
-        if (init1 == "j" || init1 == "q" || init1 == "x" || init1 == "y") {
-            remaining = "v" + remaining.substr(1); // treat as ü
-            // Actually, for j/q/x, the u IS ü. Map to y sound.
-            // Keep as-is for finals matching, the final will handle it.
-        }
-    }
-
-    // Match final
-    std::string final_ipa;
-    // Try longest match first
-    for (size_t len = std::min(remaining.size(), size_t(4)); len > 0; len--) {
-        auto it = finals.find(remaining.substr(0, len));
-        if (it != finals.end()) {
-            final_ipa = it->second;
-            break;
-        }
-    }
-
-    if (final_ipa.empty() && !remaining.empty()) {
-        // Fallback: just use the remaining as-is
-        final_ipa = remaining;
-    }
-
-    return initial_ipa + final_ipa;
-}
-
-std::string multilingual::chinese_g2p(const std::string& text) {
-    // Input is expected to be pinyin (space-separated syllables) or mixed text.
-    // CJK characters are passed through (would need pinyin conversion upstream).
-    auto chars = utf8_split(text);
-    std::string ipa;
-    std::string current_syllable;
-
-    auto flush_syllable = [&]() {
-        if (!current_syllable.empty()) {
-            ipa += pinyin_syllable_to_ipa(current_syllable);
-            current_syllable.clear();
-        }
-    };
-
-    for (size_t i = 0; i < chars.size(); i++) {
-        uint32_t cp = utf8_codepoint(chars[i]);
-
-        // CJK ideographs — pass through (needs upstream pinyin conversion)
-        if (cp >= 0x4E00 && cp <= 0x9FFF) {
-            flush_syllable();
-            // TODO: Character→pinyin conversion requires dictionary or JNI callback
-            ipa += chars[i];
-            continue;
-        }
-
-        // Space or punctuation = syllable boundary
-        if (cp == ' ' || cp == ',' || cp == '.' || cp == '!' || cp == '?'
-            || cp == ';' || cp == ':' || cp == '-') {
-            flush_syllable();
-            if (cp == ' ') ipa += " ";
-            else ipa += static_cast<char>(cp);
-            continue;
-        }
-
-        // ASCII letters and digits = part of pinyin syllable
-        if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z')
-            || (cp >= '0' && cp <= '9')) {
-            current_syllable += static_cast<char>(cp);
-            continue;
-        }
-
-        // ü (U+00FC)
-        if (cp == 0xFC) {
-            current_syllable += "v"; // internal representation
-            continue;
-        }
-
-        // Skip unknown
-        flush_syllable();
-    }
-    flush_syllable();
-
-    return kokoro_postprocess(ipa);
-}
-
-// ===========================================================================
-// HINDI (Devanagari → IPA)
-// ===========================================================================
-
-// Devanagari consonants → IPA
-static const std::unordered_map<uint32_t, std::string>& get_devanagari_consonants() {
-    static const std::unordered_map<uint32_t, std::string> map = {
-        // Velars
-        {0x0915, "k"},       // क
-        {0x0916, "k\xca\xb0"},  // ख = kʰ
-        {0x0917, "\xc9\xa1"},    // ग = ɡ
-        {0x0918, "\xc9\xa1\xca\xb1"}, // घ = ɡʱ
-        {0x0919, "\xc5\x8b"},    // ङ = ŋ
-
-        // Palatals
-        {0x091A, "t\xca\x83"},      // च = tʃ
-        {0x091B, "t\xca\x83\xca\xb0"}, // छ = tʃʰ
-        {0x091C, "d\xca\x92"},      // ज = dʒ
-        {0x091D, "d\xca\x92\xca\xb1"}, // झ = dʒʱ
-        {0x091E, "\xc9\xb2"},       // ञ = ɲ
-
-        // Retroflexes
-        {0x091F, "\xca\x88"},       // ट = ʈ
-        {0x0920, "\xca\x88\xca\xb0"}, // ठ = ʈʰ
-        {0x0921, "\xc9\x96"},       // ड = ɖ
-        {0x0922, "\xc9\x96\xca\xb1"}, // ढ = ɖʱ
-        {0x0923, "\xc9\xb3"},       // ण = ɳ
-
-        // Dentals
-        {0x0924, "t\xcc\xaa"},      // त = t̪
-        {0x0925, "t\xcc\xaa\xca\xb0"}, // थ = t̪ʰ
-        {0x0926, "d\xcc\xaa"},      // द = d̪
-        {0x0927, "d\xcc\xaa\xca\xb1"}, // ध = d̪ʱ
-        {0x0928, "n"},              // न = n
-
-        // Labials
-        {0x092A, "p"},       // प
-        {0x092B, "p\xca\xb0"},  // फ = pʰ
-        {0x092C, "b"},       // ब
-        {0x092D, "b\xca\xb1"},  // भ = bʱ
-        {0x092E, "m"},       // म
-
-        // Semi-vowels / Approximants
-        {0x092F, "j"},       // य
-        {0x0930, "\xc9\xbe"},   // र = ɾ
-        {0x0932, "l"},       // ल
-        {0x0935, "\xca\x8b"},   // व = ʋ
-
-        // Sibilants / Fricatives
-        {0x0936, "\xca\x83"},   // श = ʃ
-        {0x0937, "\xca\x82"},   // ष = ʂ
-        {0x0938, "s"},       // स
-        {0x0939, "\xc9\xa6"},   // ह = ɦ
-
-        // Nukta variants
-        {0x0958, "k"},       // क़ → k (Urdu qaf)
-        {0x0959, "x"},       // ख़ → x
-        {0x095A, "\xc9\xa3"},   // ग़ → ɣ
-        {0x095B, "z"},       // ज़ → z
-        {0x095C, "\xc9\x96"},   // ड़ → ɖ (flap)
-        {0x095D, "\xc9\x96\xca\xb1"}, // ढ़ → ɖʱ
-        {0x095E, "f"},       // फ़ → f
-    };
-    return map;
-}
-
-// Devanagari independent vowels → IPA
-static const std::unordered_map<uint32_t, std::string>& get_devanagari_vowels() {
-    static const std::unordered_map<uint32_t, std::string> map = {
-        {0x0905, "\xc9\x99"},       // अ = ə
-        {0x0906, "a\xcb\x90"},      // आ = aː
-        {0x0907, "\xc9\xaa"},       // इ = ɪ
-        {0x0908, "i\xcb\x90"},      // ई = iː
-        {0x0909, "\xca\x8a"},       // उ = ʊ
-        {0x090A, "u\xcb\x90"},      // ऊ = uː
-        {0x090B, "\xc9\xbe\xc9\xaa"}, // ऋ = ɾɪ
-        {0x090F, "e\xcb\x90"},      // ए = eː
-        {0x0910, "\xc9\x99j"},      // ऐ = əj (diphthong)
-        {0x0913, "o\xcb\x90"},      // ओ = oː
-        {0x0914, "\xc9\x99w"},      // औ = əw (diphthong)
-    };
-    return map;
-}
-
-// Devanagari vowel signs (matras) → IPA
-static const std::unordered_map<uint32_t, std::string>& get_devanagari_matras() {
-    static const std::unordered_map<uint32_t, std::string> map = {
-        {0x093E, "a\xcb\x90"},      // ा = aː
-        {0x093F, "\xc9\xaa"},       // ि = ɪ
-        {0x0940, "i\xcb\x90"},      // ी = iː
-        {0x0941, "\xca\x8a"},       // ु = ʊ
-        {0x0942, "u\xcb\x90"},      // ू = uː
-        {0x0943, "\xc9\xbe\xc9\xaa"}, // ृ = ɾɪ
-        {0x0947, "e\xcb\x90"},      // े = eː
-        {0x0948, "\xc9\x99j"},      // ै = əj
-        {0x094B, "o\xcb\x90"},      // ो = oː
-        {0x094C, "\xc9\x99w"},      // ौ = əw
-    };
-    return map;
-}
-
-std::string multilingual::hindi_g2p(const std::string& text) {
-    const auto& consonants = get_devanagari_consonants();
-    const auto& vowels = get_devanagari_vowels();
-    const auto& matras = get_devanagari_matras();
-
-    auto chars = utf8_split(text);
-    std::string ipa;
-    bool prev_was_consonant = false; // track for inherent schwa
-
-    for (size_t i = 0; i < chars.size(); i++) {
-        uint32_t cp = utf8_codepoint(chars[i]);
-
-        // Virama (halant) — suppresses inherent vowel
-        if (cp == 0x094D) {
-            prev_was_consonant = false; // no schwa for previous consonant
-            continue;
-        }
-
-        // Anusvara (nasalization)
-        if (cp == 0x0902) {
-            ipa += "\xc9\xb4"; // ɴ (generic nasal, assimilates in speech)
-            prev_was_consonant = false;
-            continue;
-        }
-
-        // Visarga
-        if (cp == 0x0903) {
-            ipa += "\xc9\xa6"; // ɦ
-            prev_was_consonant = false;
-            continue;
-        }
-
-        // Chandrabindu (nasalization of vowel)
-        if (cp == 0x0901) {
-            ipa += "\xcc\x83"; // combining tilde (nasalize previous vowel)
-            continue;
-        }
-
-        // Nukta — modifies previous consonant. Skip (handled in nukta consonant entries).
-        if (cp == 0x093C) {
-            continue;
-        }
-
-        // Check vowel signs (matras) first
-        auto matra_it = matras.find(cp);
-        if (matra_it != matras.end()) {
-            prev_was_consonant = false;
-            ipa += matra_it->second;
-            continue;
-        }
-
-        // Independent vowels
-        auto vowel_it = vowels.find(cp);
-        if (vowel_it != vowels.end()) {
-            if (prev_was_consonant) {
-                // Previous consonant had no explicit vowel — add inherent schwa
-                ipa += "\xc9\x99"; // ə
-            }
-            prev_was_consonant = false;
-            ipa += vowel_it->second;
-            continue;
-        }
-
-        // Consonants
-        auto cons_it = consonants.find(cp);
-        if (cons_it != consonants.end()) {
-            if (prev_was_consonant) {
-                // Previous consonant had no explicit vowel — add inherent schwa
-                ipa += "\xc9\x99"; // ə
-            }
-            ipa += cons_it->second;
-            prev_was_consonant = true;
-            continue;
-        }
-
-        // Space
-        if (cp == ' ') {
-            if (prev_was_consonant) {
-                // Word-final consonant: add schwa for open syllables
-                // (Hindi schwa deletion is complex — we add it conservatively)
-                ipa += "\xc9\x99"; // ə
-            }
-            prev_was_consonant = false;
-            ipa += " ";
-            continue;
-        }
-
-        // ASCII passthrough
-        if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z')
-            || (cp >= '0' && cp <= '9')) {
-            if (prev_was_consonant) {
-                ipa += "\xc9\x99"; // ə
-                prev_was_consonant = false;
-            }
-            ipa += chars[i];
-            continue;
-        }
-
-        // Punctuation
-        if (cp == ',' || cp == '.' || cp == '!' || cp == '?'
-            || cp == ';' || cp == ':' || cp == '-'
-            || cp == 0x0964 || cp == 0x0965) { // Devanagari danda / double danda
-            if (prev_was_consonant) {
-                ipa += "\xc9\x99"; // ə
-                prev_was_consonant = false;
-            }
-            if (cp == 0x0964 || cp == 0x0965) {
-                ipa += ".";
-            } else {
-                ipa += static_cast<char>(cp);
-            }
-            continue;
-        }
-
-        // Devanagari digits (0x0966-0x096F) — pass through as Arabic numerals
-        if (cp >= 0x0966 && cp <= 0x096F) {
-            if (prev_was_consonant) {
-                ipa += "\xc9\x99";
-                prev_was_consonant = false;
-            }
-            ipa += static_cast<char>('0' + (cp - 0x0966));
-            continue;
-        }
-
-        // Skip unknown
-        if (prev_was_consonant) {
-            ipa += "\xc9\x99";
-            prev_was_consonant = false;
-        }
-    }
-
-    // Handle trailing consonant
-    if (prev_was_consonant) {
-        ipa += "\xc9\x99"; // ə
-    }
-
-    return kokoro_postprocess(ipa);
-}
-
-// ===========================================================================
-// DICTIONARY-FIRST PHONEMIZERS
-// ===========================================================================
-
-// Split text into words and punctuation tokens for dictionary lookup.
-// Returns vector of strings: each is either a word (letters), whitespace, or punctuation.
-static std::vector<std::string> split_into_tokens(const std::string& text) {
-    std::vector<std::string> tokens;
-    auto chars = utf8_split(text);
-    std::string current_word;
-
-    auto flush_word = [&]() {
-        if (!current_word.empty()) {
-            tokens.push_back(current_word);
-            current_word.clear();
-        }
-    };
-
-    for (size_t i = 0; i < chars.size(); i++) {
-        uint32_t cp = utf8_codepoint(chars[i]);
-
-        // Whitespace
-        if (cp == ' ' || cp == '\t' || cp == '\n' || cp == '\r') {
-            flush_word();
-            tokens.push_back(" ");
-            continue;
-        }
-
-        // ASCII punctuation that should be passed through
-        if (cp == ',' || cp == '.' || cp == '!' || cp == '?' ||
-            cp == ';' || cp == ':' || cp == '-' || cp == '\'' || cp == '"') {
-            flush_word();
-            tokens.push_back(chars[i]);
-            continue;
-        }
-
-        // Inverted punctuation (Spanish)
-        if (cp == 0xBF || cp == 0xA1) {
-            flush_word();
-            tokens.push_back(chars[i]);
-            continue;
-        }
-
-        // Japanese/Chinese punctuation
-        if (cp == 0x3001 || cp == 0x3002 || cp == 0xFF0C || cp == 0xFF0E ||
-            cp == 0xFF01 || cp == 0xFF1F) {
-            flush_word();
-            tokens.push_back(","); // normalize CJK punctuation to comma
-            continue;
-        }
-
-        // Devanagari danda
-        if (cp == 0x0964 || cp == 0x0965) {
-            flush_word();
-            tokens.push_back(".");
-            continue;
-        }
-
-        // Everything else is part of a word
-        current_word += chars[i];
-    }
-    flush_word();
-    return tokens;
-}
-
-// Map punctuation token to IPA-compatible output.
-static std::string punct_to_ipa(const std::string& tok) {
-    if (tok == "," || tok == "." || tok == "!" || tok == "?" ||
-        tok == ";" || tok == ":" || tok == "-" || tok == "'") {
-        return tok;
-    }
-    return "";
-}
-
-// Lowercase a UTF-8 string (handles ASCII letters and common Latin accented chars).
-static std::string utf8_to_lower(const std::string& s) {
-    auto chars = utf8_split(s);
-    std::string result;
-    for (auto& ch : chars) {
-        uint32_t cp = utf8_codepoint(ch);
-        if (cp >= 'A' && cp <= 'Z') {
-            result += static_cast<char>(cp + 32);
-        } else if (cp >= 0xC0 && cp <= 0xD6) {
-            result += utf8_encode(cp + 32);
-        } else if (cp >= 0xD8 && cp <= 0xDE) {
-            result += utf8_encode(cp + 32);
-        } else {
-            result += ch;
-        }
-    }
-    return result;
-}
-
-// Check if a token is whitespace.
-static bool is_ws_token(const std::string& tok) {
-    for (char c : tok) {
-        if (c != ' ' && c != '\t' && c != '\n' && c != '\r') return false;
-    }
-    return !tok.empty();
-}
-
-// Check if a token is punctuation.
-static bool is_punct_tok(const std::string& tok) {
-    if (tok.empty()) return false;
-    if (tok.size() == 1) {
-        char c = tok[0];
-        return c == ',' || c == '.' || c == '!' || c == '?' ||
-               c == ';' || c == ':' || c == '-' || c == '\'' || c == '"';
-    }
-    uint32_t cp = utf8_codepoint(tok);
-    return cp == 0xBF || cp == 0xA1;
-}
-
-/// Generic dictionary-first phonemizer.
-/// Splits text into words, looks up each in dict, falls back to g2p_fn.
-static std::string dict_first_phonemize(
-    const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict,
-    std::string (*g2p_fn)(const std::string&))
-{
-    auto tokens = split_into_tokens(text);
-    std::string result;
-
-    for (auto& tok : tokens) {
-        if (is_ws_token(tok)) {
-            result += " ";
-            continue;
-        }
-        if (is_punct_tok(tok)) {
-            auto mapped = punct_to_ipa(tok);
-            if (!mapped.empty()) result += mapped;
-            continue;
-        }
-
-        // Try dictionary lookup (lowercase)
-        auto lower = utf8_to_lower(tok);
-        auto it = dict.find(lower);
-        if (it != dict.end() && !it->second.empty()) {
-            result += kokoro_postprocess(it->second);
-            continue;
-        }
-
-        // Fallback to rule-based G2P
-        result += g2p_fn(tok);
-    }
-
-    return result;
-}
-
-std::string multilingual::french_phonemize(
-    const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict)
-{
-    return dict_first_phonemize(text, dict, french_g2p);
-}
-
-std::string multilingual::spanish_phonemize(
-    const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict)
-{
-    return dict_first_phonemize(text, dict, spanish_g2p);
-}
-
-std::string multilingual::italian_phonemize(
-    const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict)
-{
-    return dict_first_phonemize(text, dict, italian_g2p);
-}
-
-std::string multilingual::portuguese_phonemize(
-    const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict)
-{
-    return dict_first_phonemize(text, dict, portuguese_g2p);
-}
-
-std::string multilingual::hindi_phonemize(
-    const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict)
-{
-    return dict_first_phonemize(text, dict, hindi_g2p);
-}
-
-std::string multilingual::japanese_phonemize(const std::string& text) {
-    return japanese_g2p(text);
-}
-
-std::string multilingual::chinese_phonemize(const std::string& text) {
-    return chinese_g2p(text);
-}
diff --git a/sdk/src/main/cpp/models/kokoro_multilingual.h b/sdk/src/main/cpp/models/kokoro_multilingual.h
deleted file mode 100644
index 387cfb3..0000000
--- a/sdk/src/main/cpp/models/kokoro_multilingual.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#pragma once
-
-#include <string>
-#include <unordered_map>
-
-/// Non-English phonemizers for Kokoro TTS.
-///
-/// Dictionary-first approach with rule-based G2P fallback:
-/// 1. Split text into words (whitespace + punctuation boundaries)
-/// 2. For each word: try dictionary lookup (lowercase), if found use it
-/// 3. If not found: apply rule-based grapheme-to-phoneme conversion
-/// 4. Pass punctuation tokens through
-///
-/// Languages:
-/// - French, Spanish, Portuguese, Italian, Hindi — dictionary + rule-based
-/// - Japanese — Katakana/Hiragana tables, kanji passthrough
-/// - Chinese — Pinyin->IPA conversion (requires pre-segmented pinyin input)
-namespace multilingual {
-
-// --- Dictionary-first phonemizers (preferred entry points) ---
-
-std::string french_phonemize(const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict);
-
-std::string spanish_phonemize(const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict);
-
-std::string italian_phonemize(const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict);
-
-std::string portuguese_phonemize(const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict);
-
-std::string hindi_phonemize(const std::string& text,
-    const std::unordered_map<std::string, std::string>& dict);
-
-std::string japanese_phonemize(const std::string& text);
-
-std::string chinese_phonemize(const std::string& text);
-
-// --- Rule-based G2P fallback (used when word not in dictionary) ---
-
-std::string french_g2p(const std::string& text);
-std::string spanish_g2p(const std::string& text);
-std::string portuguese_g2p(const std::string& text);
-std::string italian_g2p(const std::string& text);
-std::string japanese_g2p(const std::string& text);
-std::string chinese_g2p(const std::string& text);
-std::string hindi_g2p(const std::string& text);
-
-} // namespace multilingual
diff --git a/sdk/src/main/cpp/models/kokoro_phonemizer.cpp b/sdk/src/main/cpp/models/kokoro_phonemizer.cpp
deleted file mode 100644
index 05c3d03..0000000
--- a/sdk/src/main/cpp/models/kokoro_phonemizer.cpp
+++ /dev/null
@@ -1,456 +0,0 @@
-#include "kokoro_phonemizer.h"
-#include "kokoro_multilingual.h"
-#include <algorithm>
-#include <cctype>
-
-// ---------------------------------------------------------------------------
-// UTF-8 helpers
-// ---------------------------------------------------------------------------
-
-/// Iterate UTF-8 string one character (potentially multi-byte) at a time.
-static std::vector<std::string> utf8_chars(const std::string& s) {
-    std::vector<std::string> chars;
-    size_t i = 0;
-    while (i < s.size()) {
-        size_t len = 1;
-        unsigned char c = static_cast<unsigned char>(s[i]);
-        if ((c & 0xE0) == 0xC0) len = 2;
-        else if ((c & 0xF0) == 0xE0) len = 3;
-        else if ((c & 0xF8) == 0xF0) len = 4;
-        chars.push_back(s.substr(i, len));
-        i += len;
-    }
-    return chars;
-}
-
-static std::string to_lower(const std::string& s) {
-    std::string result = s;
-    std::transform(result.begin(), result.end(), result.begin(),
-        [](unsigned char c) { return std::tolower(c); });
-    return result;
-}
-
-static std::string capitalize(const std::string& s) {
-    if (s.empty()) return s;
-    std::string result = s;
-    result[0] = static_cast<char>(std::toupper(static_cast<unsigned char>(result[0])));
-    return result;
-}
-
-static bool is_punct(char c) {
-    return std::ispunct(static_cast<unsigned char>(c)) != 0;
-}
-
-static bool is_whitespace(const std::string& s) {
-    for (char c : s) if (!std::isspace(static_cast<unsigned char>(c))) return false;
-    return !s.empty();
-}
-
-static bool is_all_punct(const std::string& s) {
-    for (char c : s) if (!is_punct(c)) return false;
-    return !s.empty();
-}
-
-static bool ends_with(const std::string& s, const std::string& suffix) {
-    if (suffix.size() > s.size()) return false;
-    return s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
-}
-
-static void replace_all(std::string& s, const std::string& from, const std::string& to) {
-    size_t pos = 0;
-    while ((pos = s.find(from, pos)) != std::string::npos) {
-        s.replace(pos, from.size(), to);
-        pos += to.size();
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Loading
-// ---------------------------------------------------------------------------
-
-bool KokoroPhonemizer::load_vocab(const std::string& path) {
-    auto text = json::read_file(path);
-    if (text.empty()) return false;
-    vocab_ = json::parse_vocab_index(text);
-    return !vocab_.empty();
-}
-
-bool KokoroPhonemizer::load_dictionaries(const std::string& dir) {
-    auto gold_text = json::read_file(dir + "/us_gold.json");
-    if (!gold_text.empty()) {
-        gold_dict_ = json::parse_dictionary(gold_text);
-        grow_dictionary(gold_dict_);
-    }
-
-    auto silver_text = json::read_file(dir + "/us_silver.json");
-    if (!silver_text.empty()) {
-        silver_dict_ = json::parse_dictionary(silver_text);
-        grow_dictionary(silver_dict_);
-    }
-
-    return !gold_dict_.empty() || !silver_dict_.empty();
-}
-
-bool KokoroPhonemizer::load_language_dict(
-    const std::string& lang, const std::string& path)
-{
-    auto text = json::read_file(path);
-    if (text.empty()) return false;
-
-    // Language dicts are flat {"word": "phonemes"} format
-    auto dict = json::parse_flat_object(text);
-    if (dict.empty()) return false;
-
-    lang_dicts_[lang] = std::move(dict);
-    return true;
-}
-
-void KokoroPhonemizer::set_language(const std::string& lang) {
-    language_ = lang;
-}
-
-void KokoroPhonemizer::grow_dictionary(
-    std::unordered_map<std::string, json::DictEntry>& dict)
-{
-    std::unordered_map<std::string, json::DictEntry> additions;
-    for (auto& [key, entry] : dict) {
-        auto lower = to_lower(key);
-        if (key == lower && !key.empty()) {
-            auto cap = capitalize(key);
-            if (dict.find(cap) == dict.end()) additions[cap] = entry;
-        }
-        if (!key.empty() && std::isupper(static_cast<unsigned char>(key[0]))) {
-            if (dict.find(lower) == dict.end()) additions[lower] = entry;
-        }
-    }
-    for (auto& [k, v] : additions) dict[k] = std::move(v);
-}
-
-// ---------------------------------------------------------------------------
-// Tokenization
-// ---------------------------------------------------------------------------
-
-std::vector<int64_t> KokoroPhonemizer::tokenize(
-    const std::string& text, int max_length)
-{
-    auto phonemes = text_to_phonemes(text);
-    std::vector<int64_t> ids = {BOS_ID};
-
-    // Tokenize IPA string character by character
-    // Spaces dropped (not in vocab) — matches iOS behavior
-    auto chars = utf8_chars(phonemes);
-    for (auto& ch : chars) {
-        auto it = vocab_.find(ch);
-        if (it != vocab_.end()) {
-            ids.push_back(it->second);
-        }
-        // Unknown chars (including spaces) silently dropped
-    }
-
-    ids.push_back(EOS_ID);
-
-    if (static_cast<int>(ids.size()) > max_length) {
-        ids.resize(max_length - 1);
-        ids.push_back(EOS_ID);
-    }
-
-    return ids;
-}
-
-std::vector<int64_t> KokoroPhonemizer::pad(
-    const std::vector<int64_t>& ids, int length)
-{
-    if (static_cast<int>(ids.size()) >= length) {
-        return std::vector<int64_t>(ids.begin(), ids.begin() + length);
-    }
-    auto result = ids;
-    result.resize(length, PAD_ID);
-    return result;
-}
-
-// ---------------------------------------------------------------------------
-// Text → Phonemes pipeline
-// ---------------------------------------------------------------------------
-
-std::string KokoroPhonemizer::text_to_phonemes(const std::string& text) {
-    // Route non-English languages to multilingual phonemizers
-    if (language_ == "fr") {
-        auto it = lang_dicts_.find("fr");
-        static const std::unordered_map<std::string, std::string> empty;
-        return multilingual::french_phonemize(text, it != lang_dicts_.end() ? it->second : empty);
-    }
-    if (language_ == "es") {
-        auto it = lang_dicts_.find("es");
-        static const std::unordered_map<std::string, std::string> empty;
-        return multilingual::spanish_phonemize(text, it != lang_dicts_.end() ? it->second : empty);
-    }
-    if (language_ == "it") {
-        auto it = lang_dicts_.find("it");
-        static const std::unordered_map<std::string, std::string> empty;
-        return multilingual::italian_phonemize(text, it != lang_dicts_.end() ? it->second : empty);
-    }
-    if (language_ == "pt") {
-        auto it = lang_dicts_.find("pt");
-        static const std::unordered_map<std::string, std::string> empty;
-        return multilingual::portuguese_phonemize(text, it != lang_dicts_.end() ? it->second : empty);
-    }
-    if (language_ == "hi") {
-        auto it = lang_dicts_.find("hi");
-        static const std::unordered_map<std::string, std::string> empty;
-        return multilingual::hindi_phonemize(text, it != lang_dicts_.end() ? it->second : empty);
-    }
-    if (language_ == "ja") {
-        return multilingual::japanese_phonemize(text);
-    }
-    if (language_ == "zh") {
-        return multilingual::chinese_phonemize(text);
-    }
-
-    // English (default)
-    auto normalized = normalize_text(text);
-    auto words = split_words(normalized);
-
-    std::string result;
-    for (auto& word : words) {
-        if (is_whitespace(word)) {
-            result += " ";
-            continue;
-        }
-        if (is_all_punct(word)) {
-            auto mapped = punctuation_to_phoneme(word);
-            if (!mapped.empty()) result += mapped;
-            continue;
-        }
-        auto phonemes = resolve_word(word);
-        result += phonemes;
-    }
-    return result;
-}
-
-// ---------------------------------------------------------------------------
-// Text normalization
-// ---------------------------------------------------------------------------
-
-std::string KokoroPhonemizer::normalize_text(const std::string& text) {
-    std::string result = text;
-
-    struct Contraction { const char* from; const char* to; };
-    static const Contraction contractions[] = {
-        {"can't", "can not"}, {"won't", "will not"}, {"don't", "do not"},
-        {"doesn't", "does not"}, {"didn't", "did not"}, {"isn't", "is not"},
-        {"aren't", "are not"}, {"wasn't", "was not"}, {"weren't", "were not"},
-        {"couldn't", "could not"}, {"wouldn't", "would not"}, {"shouldn't", "should not"},
-        {"haven't", "have not"}, {"hasn't", "has not"}, {"hadn't", "had not"},
-        {"i'm", "i am"}, {"i've", "i have"}, {"i'll", "i will"}, {"i'd", "i would"},
-        {"you're", "you are"}, {"you've", "you have"}, {"you'll", "you will"},
-        {"he's", "he is"}, {"she's", "she is"}, {"it's", "it is"},
-        {"we're", "we are"}, {"we've", "we have"}, {"we'll", "we will"},
-        {"they're", "they are"}, {"they've", "they have"}, {"they'll", "they will"},
-        {"that's", "that is"}, {"there's", "there is"}, {"let's", "let us"},
-    };
-
-    auto lower = to_lower(result);
-    for (auto& c : contractions) {
-        if (lower.find(c.from) != std::string::npos) {
-            // Case-insensitive replace
-            std::string from_lower(c.from);
-            size_t pos = lower.find(from_lower);
-            while (pos != std::string::npos) {
-                result.replace(pos, from_lower.size(), c.to);
-                lower = to_lower(result);
-                pos = lower.find(from_lower, pos + std::string(c.to).size());
-            }
-        }
-    }
-
-    // Collapse multiple spaces
-    replace_all(result, "  ", " ");
-
-    // Trim
-    size_t start = result.find_first_not_of(" \t\n\r");
-    size_t end = result.find_last_not_of(" \t\n\r");
-    if (start == std::string::npos) return "";
-    return result.substr(start, end - start + 1);
-}
-
-std::vector<std::string> KokoroPhonemizer::split_words(const std::string& text) {
-    std::vector<std::string> words;
-    std::string current;
-
-    for (char c : text) {
-        if (std::isspace(static_cast<unsigned char>(c))) {
-            if (!current.empty()) { words.push_back(current); current.clear(); }
-            words.emplace_back(1, ' ');
-        } else if (is_punct(c)) {
-            if (!current.empty()) { words.push_back(current); current.clear(); }
-            words.emplace_back(1, c);
-        } else {
-            current += c;
-        }
-    }
-    if (!current.empty()) words.push_back(current);
-    return words;
-}
-
-// ---------------------------------------------------------------------------
-// Word resolution
-// ---------------------------------------------------------------------------
-
-std::string KokoroPhonemizer::resolve_word(const std::string& word) {
-    auto lower = to_lower(word);
-    auto sp = special_case(lower);
-    if (!sp.empty()) return sp;
-    auto dict = lookup_dict(lower);
-    if (!dict.empty()) return dict;
-    auto stemmed = stem_and_lookup(lower);
-    if (!stemmed.empty()) return stemmed;
-    // Fallback: return word as-is (will be mostly dropped during tokenization)
-    return lower;
-}
-
-std::string KokoroPhonemizer::lookup_dict(const std::string& word) {
-    auto it = gold_dict_.find(word);
-    if (it != gold_dict_.end()) return resolve_entry(it->second);
-    it = silver_dict_.find(word);
-    if (it != silver_dict_.end()) return resolve_entry(it->second);
-    return "";
-}
-
-std::string KokoroPhonemizer::resolve_entry(const json::DictEntry& entry) {
-    if (!entry.is_heteronym()) return entry.simple;
-    auto it = entry.pos_map.find("DEFAULT");
-    if (it != entry.pos_map.end()) return it->second;
-    if (!entry.pos_map.empty()) return entry.pos_map.begin()->second;
-    return "";
-}
-
-std::string KokoroPhonemizer::special_case(const std::string& word) {
-    if (word == "the") return "\xC3\xB0\xC9\x99";  // ðə
-    if (word == "a") return "\xC9\x90";              // ɐ
-    if (word == "an") return "\xC9\x99n";            // ən
-    if (word == "to") return "t\xCA\x8A";            // tʊ
-    if (word == "of") return "\xCA\x8Cv";            // ʌv
-    if (word == "i") return "a\xC9\xAA";             // aɪ
-    return "";
-}
-
-std::string KokoroPhonemizer::punctuation_to_phoneme(const std::string& text) {
-    if (text == "," || text == "." || text == "!" || text == "?" ||
-        text == ";" || text == ":" || text == "-" || text == "'") {
-        return text;
-    }
-    return "";
-}
-
-// ---------------------------------------------------------------------------
-// Suffix stemming
-// ---------------------------------------------------------------------------
-
-std::string KokoroPhonemizer::stem_and_lookup(const std::string& word) {
-    auto r = stem_s(word);
-    if (!r.empty()) return r;
-    r = stem_ed(word);
-    if (!r.empty()) return r;
-    r = stem_ing(word);
-    if (!r.empty()) return r;
-    return "";
-}
-
-std::string KokoroPhonemizer::stem_s(const std::string& word) {
-    if (!ends_with(word, "s") || word.size() <= 2) return "";
-
-    if (ends_with(word, "ies")) {
-        auto stem = word.substr(0, word.size() - 3) + "y";
-        auto phonemes = lookup_dict(stem);
-        if (!phonemes.empty()) return phonemes + "z";
-    }
-
-    if (ends_with(word, "es") && word.size() > 3) {
-        auto stem = word.substr(0, word.size() - 2);
-        auto phonemes = lookup_dict(stem);
-        if (!phonemes.empty()) {
-            if (!phonemes.empty()) {
-                char last = phonemes.back();
-                // After sibilants: +ɪz
-                if (last == 's' || last == 'z') {
-                    return phonemes + "\xC9\xAA" "z"; // ɪz
-                }
-            }
-            return phonemes + "z";
-        }
-    }
-
-    auto stem = word.substr(0, word.size() - 1);
-    auto phonemes = lookup_dict(stem);
-    if (!phonemes.empty()) {
-        // Voiceless consonants: +s, otherwise +z
-        char last = phonemes.back();
-        if (last == 'p' || last == 't' || last == 'k' || last == 'f') {
-            return phonemes + "s";
-        }
-        return phonemes + "z";
-    }
-    return "";
-}
-
-std::string KokoroPhonemizer::stem_ed(const std::string& word) {
-    if (!ends_with(word, "ed") || word.size() <= 3) return "";
-
-    if (ends_with(word, "ied")) {
-        auto stem = word.substr(0, word.size() - 3) + "y";
-        auto phonemes = lookup_dict(stem);
-        if (!phonemes.empty()) return phonemes + "d";
-    }
-
-    auto stem_base = word.substr(0, word.size() - 2);
-    if (stem_base.size() >= 2) {
-        char last = stem_base.back();
-        char prev = stem_base[stem_base.size() - 2];
-        if (last == prev) {
-            // Doubled consonant — try dedoubled stem
-            auto dedoubled = stem_base.substr(0, stem_base.size() - 1);
-            auto phonemes = lookup_dict(dedoubled);
-            if (!phonemes.empty()) return phonemes + ed_suffix(phonemes);
-        }
-    }
-
-    auto phonemes = lookup_dict(stem_base);
-    if (!phonemes.empty()) return phonemes + ed_suffix(phonemes);
-    return "";
-}
-
-std::string KokoroPhonemizer::ed_suffix(const std::string& phonemes) {
-    if (phonemes.empty()) return "d";
-    char last = phonemes.back();
-    if (last == 't' || last == 'd') return "\xC9\xAA" "d"; // ɪd
-    if (last == 'p' || last == 'k' || last == 'f' || last == 's') {
-        return "t";
-    }
-    return "d";
-}
-
-std::string KokoroPhonemizer::stem_ing(const std::string& word) {
-    if (!ends_with(word, "ing") || word.size() <= 4) return "";
-
-    auto stem = word.substr(0, word.size() - 3);
-
-    if (stem.size() >= 2) {
-        char last = stem.back();
-        char prev = stem[stem.size() - 2];
-        if (last == prev) {
-            auto dedoubled = stem.substr(0, stem.size() - 1);
-            auto phonemes = lookup_dict(dedoubled);
-            if (!phonemes.empty()) return phonemes + "\xC9\xAA\xC5\x8B"; // ɪŋ
-        }
-    }
-
-    auto phonemes = lookup_dict(stem);
-    if (!phonemes.empty()) return phonemes + "\xC9\xAA\xC5\x8B"; // ɪŋ
-
-    // Try stem + "e" (e.g., "making" → "make")
-    auto stem_e = stem + "e";
-    phonemes = lookup_dict(stem_e);
-    if (!phonemes.empty()) return phonemes + "\xC9\xAA\xC5\x8B"; // ɪŋ
-
-    return "";
-}
diff --git a/sdk/src/main/cpp/models/kokoro_phonemizer.h b/sdk/src/main/cpp/models/kokoro_phonemizer.h
deleted file mode 100644
index 0f4db02..0000000
--- a/sdk/src/main/cpp/models/kokoro_phonemizer.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "../util/json.h"
-
-/// GPL-free phonemizer for Kokoro TTS — ported from speech-swift.
-///
-/// Three-tier approach for English (all Apache-2.0 / BSD compatible):
-/// 1. Dictionary lookup — gold + silver IPA dictionaries from misaki
-/// 2. Suffix stemming — strips -s/-ed/-ing, looks up stem, applies phonological rules
-/// 3. BART G2P — encoder-decoder neural model for OOV words (optional ONNX)
-///
-/// Non-English languages use dictionary-first with rule-based G2P fallback.
-///
-/// No eSpeak-NG dependency.
-class KokoroPhonemizer {
-public:
-    // Kokoro's vocab uses '$' (token id 0) as the BOS / EOS / padding symbol —
-    // see vocab_index.json: '$' -> 0, ';' -> 1, ':' -> 2. Earlier code used 1
-    // and 2, which prepended a literal semicolon and appended a colon to every
-    // utterance, throwing off the model's prosody and dropping the first word.
-    // Verified by round-tripping prompts through speech_synthesize +
-    // speech_transcribe: with the wrong wrap "Hello world" came back as
-    // "I wrote"; with id 0 it round-trips to "Hello world".
-    static constexpr int PAD_ID = 0;
-    static constexpr int BOS_ID = 0;
-    static constexpr int EOS_ID = 0;
-
-    KokoroPhonemizer() = default;
-
-    /// Load IPA symbol → token ID vocabulary from vocab_index.json.
-    bool load_vocab(const std::string& path);
-
-    /// Load pronunciation dictionaries (us_gold.json, us_silver.json).
-    bool load_dictionaries(const std::string& dir);
-
-    /// Load a language-specific pronunciation dictionary (dict_fr.json, etc.).
-    /// Returns true if the dictionary was loaded successfully.
-    bool load_language_dict(const std::string& lang, const std::string& path);
-
-    /// Set the active language for phonemization.
-    /// Supported: "en" (default), "fr", "es", "it", "pt", "hi", "ja", "zh".
-    void set_language(const std::string& lang);
-
-    /// Convert text → phoneme token IDs (with BOS/EOS, max 510).
-    std::vector<int64_t> tokenize(const std::string& text, int max_length = 510);
-
-    /// Pad token IDs to fixed length.
-    std::vector<int64_t> pad(const std::vector<int64_t>& ids, int length);
-
-    /// Convert text to IPA phoneme string.
-    std::string text_to_phonemes(const std::string& text);
-
-private:
-
-    std::string normalize_text(const std::string& text);
-    std::vector<std::string> split_words(const std::string& text);
-    std::string resolve_word(const std::string& word);
-    std::string lookup_dict(const std::string& word);
-    std::string special_case(const std::string& word);
-    std::string stem_and_lookup(const std::string& word);
-    std::string stem_s(const std::string& word);
-    std::string stem_ed(const std::string& word);
-    std::string stem_ing(const std::string& word);
-    std::string ed_suffix(const std::string& phonemes);
-    std::string punctuation_to_phoneme(const std::string& text);
-
-    void grow_dictionary(std::unordered_map<std::string, json::DictEntry>& dict);
-    std::string resolve_entry(const json::DictEntry& entry);
-
-    // IPA symbol → token ID
-    std::unordered_map<std::string, int> vocab_;
-
-    // English pronunciation dictionaries
-    std::unordered_map<std::string, json::DictEntry> gold_dict_;
-    std::unordered_map<std::string, json::DictEntry> silver_dict_;
-
-    // Active language (default: English)
-    std::string language_ = "en";
-
-    // Non-English pronunciation dictionaries keyed by language code
-    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> lang_dicts_;
-};
diff --git a/sdk/src/main/cpp/models/kokoro_tts.cpp b/sdk/src/main/cpp/models/kokoro_tts.cpp
deleted file mode 100644
index 63af1e5..0000000
--- a/sdk/src/main/cpp/models/kokoro_tts.cpp
+++ /dev/null
@@ -1,258 +0,0 @@
-#include "kokoro_tts.h"
-#include "onnx_engine.h"
-#include <cstring>
-#include <cstdlib>
-#include <fstream>
-
-static constexpr int MAX_PHONEMES = 128;
-
-KokoroTts::KokoroTts(
-    const std::string& model_path,
-    const std::string& voices_dir,
-    const std::string& data_dir,
-    bool nnapi)
-    : voices_dir_(voices_dir)
-{
-    auto& engine = OnnxEngine::get();
-    api_ = engine.api();
-    session_ = engine.load(model_path, nnapi);
-
-    // Load phonemizer vocabulary and dictionaries
-    phonemizer_.load_vocab(data_dir + "/vocab_index.json");
-    phonemizer_.load_dictionaries(data_dir);
-
-    // Load optional non-English pronunciation dictionaries
-    for (const char* lang : {"fr", "es", "it", "pt", "hi"}) {
-        phonemizer_.load_language_dict(lang,
-            data_dir + "/dict_" + lang + ".json");
-    }
-
-    // Load default voice
-    set_voice("af_heart");
-}
-
-KokoroTts::~KokoroTts() {
-    if (session_) api_->ReleaseSession(session_);
-}
-
-void KokoroTts::set_voice(const std::string& name) {
-    voice_embedding_ = load_voice_embedding(name);
-}
-
-std::vector<float> KokoroTts::load_voice_embedding(const std::string& name) {
-    std::string path = voices_dir_ + "/" + name + ".bin";
-    std::ifstream file(path, std::ios::binary);
-    if (!file.is_open()) {
-        LOGE("Voice file not found: %s", path.c_str());
-        return std::vector<float>(256, 0.0f);
-    }
-
-    std::vector<float> embedding(256);
-    file.read(reinterpret_cast<char*>(embedding.data()), 256 * sizeof(float));
-    return embedding;
-}
-
-void KokoroTts::auto_switch_voice(const std::string& lang) {
-    if (lang == current_lang_) return;
-    current_lang_ = lang;
-
-    // Map language to default voice
-    struct LangVoice { const char* lang; const char* voice; };
-    static const LangVoice map[] = {
-        {"en", "af_heart"},
-        {"fr", "ff_siwis"},
-        {"es", "ef_dora"},
-        {"it", "if_sara"},
-        {"pt", "pf_dora"},
-        {"hi", "hf_alpha"},
-        {"ja", "jf_alpha"},
-        {"zh", "zf_xiaobei"},
-        {"ko", "kf_somi"},
-    };
-
-    for (auto& entry : map) {
-        if (lang == entry.lang) {
-            auto emb = load_voice_embedding(entry.voice);
-            if (emb[0] != 0.0f || emb[1] != 0.0f) {  // check not zeroed (missing file)
-                voice_embedding_ = std::move(emb);
-                LOGI("TTS: auto-switched voice to %s for language %s", entry.voice, entry.lang);
-            }
-            return;
-        }
-    }
-    // Unknown language — keep current voice
-}
-
-void KokoroTts::synthesize(
-    const char* text, const char* language,
-    ChunkCallback on_chunk, void* ctx)
-{
-    cancelled_ = false;
-
-    // Set language and auto-switch voice if language changed
-    std::string lang = (language && language[0]) ? language : "en";
-    phonemizer_.set_language(lang);
-    auto_switch_voice(lang);
-    auto* mem = OnnxEngine::get().cpu_memory();
-
-    // Text → phoneme token IDs
-    auto raw_tokens = phonemizer_.tokenize(text, MAX_PHONEMES);
-    if (raw_tokens.empty() || cancelled_) return;
-
-    size_t token_count = raw_tokens.size();
-
-    LOGI("TTS: text='%.60s' tokens=%zu", text, token_count);
-
-    // Pad to fixed MAX_PHONEMES with attention mask
-    std::vector<int64_t> input_ids(MAX_PHONEMES, 0);
-    std::vector<int64_t> attention_mask(MAX_PHONEMES, 0);
-    for (size_t i = 0; i < token_count && i < MAX_PHONEMES; i++) {
-        input_ids[i] = raw_tokens[i];
-        attention_mask[i] = 1;
-    }
-
-    // --- input tensors ---
-
-    const int64_t ids_shape[] = {1, MAX_PHONEMES};
-
-    // input_ids [1, 128]
-    OrtValue* t_ids = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, input_ids.data(), input_ids.size() * sizeof(int64_t),
-        ids_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_ids));
-
-    // attention_mask [1, 128]
-    OrtValue* t_mask = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, attention_mask.data(), attention_mask.size() * sizeof(int64_t),
-        ids_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_mask));
-
-    // ref_s / voice embedding [1, 256]
-    const int64_t style_shape[] = {1, 256};
-    OrtValue* t_style = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, voice_embedding_.data(), voice_embedding_.size() * sizeof(float),
-        style_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_style));
-
-    // speed [1]
-    float speed = 0.85f;
-    const int64_t speed_shape[] = {1};
-    OrtValue* t_speed = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, &speed, sizeof(float),
-        speed_shape, 1, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_speed));
-
-    // random_phases [1, 9]
-    float phases[9];
-    for (int i = 0; i < 9; i++)
-        phases[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
-    const int64_t phases_shape[] = {1, 9};
-    OrtValue* t_phases = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, phases, sizeof(phases),
-        phases_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_phases));
-
-    // --- run ---
-
-    const char* in_names[]  = {"input_ids", "attention_mask", "ref_s", "speed", "random_phases"};
-    const char* out_names[] = {"audio", "audio_length_samples", "pred_dur"};
-    OrtValue* inputs[]  = {t_ids, t_mask, t_style, t_speed, t_phases};
-    OrtValue* outputs[] = {nullptr, nullptr, nullptr};
-
-    ort_check(api_, api_->Run(
-        session_, nullptr,
-        in_names, inputs, 5,
-        out_names, 3, outputs));
-
-    if (!cancelled_) {
-        float* audio = nullptr;
-        ort_check(api_, api_->GetTensorMutableData(outputs[0], (void**)&audio));
-
-        // Get valid sample count from model
-        int64_t* len_ptr = nullptr;
-        ort_check(api_, api_->GetTensorMutableData(outputs[1], (void**)&len_ptr));
-        size_t valid_samples = static_cast<size_t>(len_ptr[0]);
-
-        // Inspect peak before any processing — short prompts (≤5 tokens) can
-        // make the E2E ONNX export numerically explode (peak in the hundreds).
-        // Treat that as a synthesis failure rather than amplifying garbage.
-        float peak = 0.0f;
-        for (size_t i = 0; i < valid_samples; i++) {
-            float a = std::abs(audio[i]);
-            if (a > peak) peak = a;
-        }
-        if (peak > 2.0f) {
-            LOGI("TTS: dropping output, peak=%.2f indicates numerical instability "
-                 "(short prompt? text='%.40s')", peak, text);
-            // Cleanup outputs and return without emitting audio
-            for (int i = 2; i >= 0; i--) api_->ReleaseValue(outputs[i]);
-            api_->ReleaseValue(t_phases);
-            api_->ReleaseValue(t_speed);
-            api_->ReleaseValue(t_style);
-            api_->ReleaseValue(t_mask);
-            api_->ReleaseValue(t_ids);
-            return;
-        }
-
-        // Trim trailing artifacts — Kokoro's E2E model often emits 100-300 ms
-        // of low-energy noise + occasional loud spike past the real speech.
-        // Walk backwards through 50 ms windows; the last window above the
-        // silence floor is where speech ended. Sustained-energy threshold
-        // (50 ms window) avoids mistaking isolated artifact spikes for
-        // speech. Mirrors KokoroTTSModel.synthesize() in speech-swift.
-        constexpr int sample_rate = 24000;
-        constexpr float silence_rms = 0.030f;
-        const size_t win = std::max<size_t>(1, sample_rate / 20);  // 50 ms
-        size_t speech_end = valid_samples;
-        if (valid_samples > win) {
-            for (size_t i = valid_samples - win; i > 0; i -= win / 2) {
-                float sum_sq = 0.0f;
-                for (size_t j = 0; j < win; j++) {
-                    float v = audio[i + j];
-                    sum_sq += v * v;
-                }
-                float rms = std::sqrt(sum_sq / static_cast<float>(win));
-                if (rms > silence_rms) {
-                    speech_end = i + win;
-                    break;
-                }
-                if (i < win / 2) break;
-            }
-        }
-        if (speech_end < valid_samples) {
-            for (size_t k = speech_end; k < valid_samples; k++) audio[k] = 0.0f;
-        }
-        // ~10 ms linear fade-out at the new tail boundary so the seam is smooth.
-        const size_t fade_out = std::min<size_t>(speech_end, sample_rate / 100);
-        if (fade_out >= 2) {
-            const size_t start = speech_end - fade_out;
-            const float denom = static_cast<float>(fade_out - 1);
-            for (size_t k = 0; k < fade_out; k++) {
-                float gain = static_cast<float>(fade_out - 1 - k) / denom;
-                audio[start + k] *= gain;
-            }
-        }
-        // 5 ms fade-in to prevent click at start.
-        const size_t fade_in = std::min<size_t>(120, speech_end);
-        for (size_t i = 0; i < fade_in; i++) {
-            audio[i] *= static_cast<float>(i) / static_cast<float>(fade_in);
-        }
-
-        LOGI("TTS: valid=%zu speech_end=%zu peak=%.4f", valid_samples, speech_end, peak);
-
-        on_chunk(audio, speech_end, true, ctx);
-    }
-
-    // --- cleanup ---
-
-    for (int i = 2; i >= 0; i--) api_->ReleaseValue(outputs[i]);
-    api_->ReleaseValue(t_phases);
-    api_->ReleaseValue(t_speed);
-    api_->ReleaseValue(t_style);
-    api_->ReleaseValue(t_mask);
-    api_->ReleaseValue(t_ids);
-}
-
-void KokoroTts::cancel() {
-    cancelled_ = true;
-}
diff --git a/sdk/src/main/cpp/models/kokoro_tts.h b/sdk/src/main/cpp/models/kokoro_tts.h
deleted file mode 100644
index dc732d8..0000000
--- a/sdk/src/main/cpp/models/kokoro_tts.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#pragma once
-
-#include <onnxruntime_c_api.h>
-#include <string>
-#include <vector>
-#include "kokoro_phonemizer.h"
-
-/// Kokoro 82M — lightweight text-to-speech via ONNX Runtime.
-/// Non-autoregressive, single-pass synthesis.
-/// Output: 24 kHz PCM Float32.
-class KokoroTts {
-public:
-    KokoroTts(const std::string& model_path,
-              const std::string& voices_dir,
-              const std::string& data_dir,
-              bool nnapi = true);
-    ~KokoroTts();
-
-    using ChunkCallback = void(*)(const float* samples, size_t length,
-                                  bool is_final, void* ctx);
-
-    void synthesize(const char* text, const char* language,
-                    ChunkCallback on_chunk, void* ctx);
-    void cancel();
-    int output_sample_rate() const { return 24000; }
-
-    void set_voice(const std::string& name);
-
-private:
-    std::vector<float> load_voice_embedding(const std::string& name);
-    void auto_switch_voice(const std::string& language);
-
-    const OrtApi* api_;
-    OrtSession* session_ = nullptr;
-
-    KokoroPhonemizer phonemizer_;
-    std::vector<float> voice_embedding_;
-    std::string voices_dir_;
-    std::string current_lang_;
-    bool cancelled_ = false;
-};
diff --git a/sdk/src/main/cpp/models/onnx_backend.h b/sdk/src/main/cpp/models/onnx_backend.h
deleted file mode 100644
index 0111b43..0000000
--- a/sdk/src/main/cpp/models/onnx_backend.h
+++ /dev/null
@@ -1,131 +0,0 @@
-#pragma once
-
-#include "inference_engine.h"
-#include "onnx_engine.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-/// ONNX Runtime output tensor — wraps OrtValue*.
-class OnnxOutputTensor : public OutputTensor {
-public:
-    OnnxOutputTensor(const OrtApi* api, OrtValue* value) : api_(api), value_(value) {}
-
-    ~OnnxOutputTensor() override {
-        if (value_) api_->ReleaseValue(value_);
-    }
-
-    float* data_float() override {
-        float* data = nullptr;
-        ort_check(api_, api_->GetTensorMutableData(value_, (void**)&data));
-        return data;
-    }
-
-    int64_t* data_int64() override {
-        int64_t* data = nullptr;
-        ort_check(api_, api_->GetTensorMutableData(value_, (void**)&data));
-        return data;
-    }
-
-    std::vector<int64_t> shape() override {
-        OrtTensorTypeAndShapeInfo* info = nullptr;
-        ort_check(api_, api_->GetTensorTypeAndShape(value_, &info));
-        size_t dim_count = 0;
-        api_->GetDimensionsCount(info, &dim_count);
-        std::vector<int64_t> dims(dim_count);
-        api_->GetDimensions(info, dims.data(), dim_count);
-        api_->ReleaseTensorTypeAndShapeInfo(info);
-        return dims;
-    }
-
-    size_t element_count() override {
-        auto s = shape();
-        size_t n = 1;
-        for (auto d : s) n *= static_cast<size_t>(d);
-        return n;
-    }
-
-private:
-    const OrtApi* api_;
-    OrtValue* value_;
-};
-
-/// ONNX Runtime session — wraps OrtSession*.
-class OnnxSession : public InferenceSession {
-public:
-    OnnxSession(const OrtApi* api, OrtSession* session)
-        : api_(api), session_(session) {}
-
-    ~OnnxSession() override {
-        if (session_) api_->ReleaseSession(session_);
-    }
-
-    std::vector<std::unique_ptr<OutputTensor>> run(
-        const std::vector<const char*>& input_names,
-        const std::vector<TensorInfo>& inputs,
-        const std::vector<const char*>& output_names) override
-    {
-        auto* mem = OnnxEngine::get().cpu_memory();
-        size_t num_in = inputs.size();
-        size_t num_out = output_names.size();
-
-        // Create input OrtValues
-        std::vector<OrtValue*> ort_inputs(num_in, nullptr);
-        for (size_t i = 0; i < num_in; i++) {
-            auto& t = inputs[i];
-            ONNXTensorElementDataType ort_dtype;
-            switch (t.dtype) {
-                case DType::FLOAT32: ort_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; break;
-                case DType::INT64:   ort_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; break;
-                case DType::INT32:   ort_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; break;
-                case DType::INT8:    ort_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; break;
-            }
-            ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-                mem,
-                const_cast<void*>(t.data),
-                t.byte_size(),
-                t.shape.data(),
-                t.shape.size(),
-                ort_dtype,
-                &ort_inputs[i]));
-        }
-
-        // Prepare output array
-        std::vector<OrtValue*> ort_outputs(num_out, nullptr);
-
-        // Run
-        ort_check(api_, api_->Run(
-            session_, nullptr,
-            input_names.data(), ort_inputs.data(), num_in,
-            output_names.data(), num_out, ort_outputs.data()));
-
-        // Release inputs
-        for (auto* v : ort_inputs) api_->ReleaseValue(v);
-
-        // Wrap outputs
-        std::vector<std::unique_ptr<OutputTensor>> results;
-        results.reserve(num_out);
-        for (auto* v : ort_outputs) {
-            results.push_back(std::make_unique<OnnxOutputTensor>(api_, v));
-        }
-        return results;
-    }
-
-private:
-    const OrtApi* api_;
-    OrtSession* session_;
-};
-
-/// ONNX Runtime backend — delegates to OnnxEngine singleton.
-class OnnxBackend : public InferenceBackend {
-public:
-    std::unique_ptr<InferenceSession> load(
-        const std::string& path, bool hw_accel = true) override
-    {
-        auto& engine = OnnxEngine::get();
-        OrtSession* session = engine.load(path, hw_accel);
-        return std::make_unique<OnnxSession>(engine.api(), session);
-    }
-
-    Backend type() const override { return Backend::ONNX; }
-};
diff --git a/sdk/src/main/cpp/models/onnx_engine.h b/sdk/src/main/cpp/models/onnx_engine.h
deleted file mode 100644
index 4dcee0f..0000000
--- a/sdk/src/main/cpp/models/onnx_engine.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#pragma once
-
-#include <onnxruntime_c_api.h>
-#include <stdexcept>
-#include <string>
-
-#ifdef __ANDROID__
-#include <android/log.h>
-#define LOG_TAG "Speech"
-#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
-#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
-#else
-#include <cstdio>
-#define LOGI(...) do { fprintf(stderr, "[speech] "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
-#define LOGE(...) do { fprintf(stderr, "[speech ERROR] "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
-#endif
-
-inline void ort_check(const OrtApi* api, OrtStatus* status) {
-    if (status != nullptr) {
-        const char* msg = api->GetErrorMessage(status);
-        std::string err(msg);
-        api->ReleaseStatus(status);
-        throw std::runtime_error("ORT: " + err);
-    }
-}
-
-/// Singleton ONNX Runtime environment shared across all models.
-class OnnxEngine {
-public:
-    static OnnxEngine& get() {
-        static OnnxEngine instance;
-        return instance;
-    }
-
-    const OrtApi* api() const { return api_; }
-    OrtEnv* env() const { return env_; }
-
-    /// True if any model fell back from NNAPI to CPU during session creation.
-    bool had_nnapi_fallback() const { return nnapi_fallback_; }
-    const std::string& nnapi_fallback_reason() const { return nnapi_fallback_reason_; }
-
-    OrtSession* load(const std::string& path, bool nnapi = true) {
-        OrtSessionOptions* opts = nullptr;
-        ort_check(api_, api_->CreateSessionOptions(&opts));
-        api_->SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL);
-        api_->SetIntraOpNumThreads(opts, 2);
-
-        if (nnapi) {
-            LOGI("Loading model with hardware acceleration: %s",
-                 path.substr(path.find_last_of('/') + 1).c_str());
-#ifdef __ANDROID__
-            const char* keys[] = {"nnapi_flags"};
-            const char* values[] = {"0"};
-            OrtStatus* s = api_->SessionOptionsAppendExecutionProvider(
-                opts, "NNAPI", keys, values, 1);
-#else
-            const char* keys[] = {"backend_path"};
-            const char* values[] = {"libQnnHtp.so"};
-            OrtStatus* s = api_->SessionOptionsAppendExecutionProvider(
-                opts, "QNN", keys, values, 1);
-#endif
-            if (s != nullptr) {
-                LOGI("Hardware EP unavailable, using CPU");
-                api_->ReleaseStatus(s);
-            }
-        }
-
-        OrtSession* session = nullptr;
-        OrtStatus* create_status = api_->CreateSession(env_, path.c_str(), opts, &session);
-
-        // If session creation fails with NNAPI, retry CPU-only
-        if (create_status != nullptr && nnapi) {
-            const char* msg = api_->GetErrorMessage(create_status);
-            LOGI("NNAPI session failed (%s), retrying CPU-only", msg);
-            nnapi_fallback_ = true;
-            nnapi_fallback_reason_ = msg;
-            api_->ReleaseStatus(create_status);
-            api_->ReleaseSessionOptions(opts);
-
-            opts = nullptr;
-            ort_check(api_, api_->CreateSessionOptions(&opts));
-            api_->SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL);
-            api_->SetIntraOpNumThreads(opts, 4);
-
-            ort_check(api_, api_->CreateSession(env_, path.c_str(), opts, &session));
-        } else if (create_status != nullptr) {
-            // CPU-only also failed — propagate the error
-            const char* msg = api_->GetErrorMessage(create_status);
-            std::string err(msg);
-            api_->ReleaseStatus(create_status);
-            api_->ReleaseSessionOptions(opts);
-            throw std::runtime_error("ORT: " + err);
-        }
-
-        api_->ReleaseSessionOptions(opts);
-        return session;
-    }
-
-    OrtMemoryInfo* cpu_memory() const { return mem_; }
-
-    ~OnnxEngine() {
-        if (mem_) api_->ReleaseMemoryInfo(mem_);
-        if (env_) api_->ReleaseEnv(env_);
-    }
-
-private:
-    OnnxEngine() {
-        api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION);
-        ort_check(api_, api_->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "speech", &env_));
-        ort_check(api_, api_->CreateCpuMemoryInfo(
-            OrtArenaAllocator, OrtMemTypeDefault, &mem_));
-    }
-
-    OnnxEngine(const OnnxEngine&) = delete;
-    OnnxEngine& operator=(const OnnxEngine&) = delete;
-
-    const OrtApi* api_ = nullptr;
-    OrtEnv* env_ = nullptr;
-    OrtMemoryInfo* mem_ = nullptr;
-    bool nnapi_fallback_ = false;
-    std::string nnapi_fallback_reason_;
-};
diff --git a/sdk/src/main/cpp/models/parakeet_stt.cpp b/sdk/src/main/cpp/models/parakeet_stt.cpp
deleted file mode 100644
index df73bcc..0000000
--- a/sdk/src/main/cpp/models/parakeet_stt.cpp
+++ /dev/null
@@ -1,412 +0,0 @@
-#include "parakeet_stt.h"
-#include "onnx_engine.h"
-#include "../audio/mel.h"
-#include "../util/json.h"
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-
-// ---------------------------------------------------------------------------
-// SentencePiece U+2581 → space, then trim
-// ---------------------------------------------------------------------------
-
-static void replace_sp_marker(std::string& s) {
-    const std::string marker = "\xE2\x96\x81";
-    size_t pos = 0;
-    while ((pos = s.find(marker, pos)) != std::string::npos) {
-        s.replace(pos, marker.size(), " ");
-        pos += 1;
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Construction
-// ---------------------------------------------------------------------------
-
-ParakeetStt::ParakeetStt(
-    const std::string& encoder_path,
-    const std::string& decoder_joint_path,
-    const std::string& vocab_path,
-    bool nnapi)
-{
-    auto& engine = OnnxEngine::get();
-    api_ = engine.api();
-    encoder_       = engine.load(encoder_path, nnapi);
-    decoder_joint_ = engine.load(decoder_joint_path, false);
-
-    load_vocab(vocab_path);
-}
-
-ParakeetStt::~ParakeetStt() {
-    if (decoder_joint_) api_->ReleaseSession(decoder_joint_);
-    if (encoder_)       api_->ReleaseSession(encoder_);
-}
-
-// ---------------------------------------------------------------------------
-// Vocabulary
-// ---------------------------------------------------------------------------
-
-bool ParakeetStt::load_vocab(const std::string& path) {
-    auto text = json::read_file(path);
-    if (text.empty()) return false;
-
-    auto flat = json::parse_flat_object(text);
-    for (auto& [key, val] : flat) {
-        try {
-            int id = std::stoi(key);
-            vocab_[id] = val;
-
-            // Index language tokens like <|en|>, <|fr|>, etc.
-            if (val.size() >= 5 && val.size() <= 6 &&
-                val.substr(0, 2) == "<|" && val.substr(val.size() - 2) == "|>") {
-                std::string code = val.substr(2, val.size() - 4);
-                lang_tokens_[id] = code;
-            }
-        } catch (...) {}
-    }
-
-    // Update config based on actual vocab size
-    if (!vocab_.empty()) {
-        cfg_.vocab_size = static_cast<int>(vocab_.size());
-        cfg_.blank_id = cfg_.vocab_size;
-        cfg_.total_logits = cfg_.vocab_size + 1 + cfg_.num_dur_bins;
-    }
-
-    LOGI("Parakeet vocab: %zu tokens, %zu language tokens, blank=%d",
-         vocab_.size(), lang_tokens_.size(), cfg_.blank_id);
-    return !vocab_.empty();
-}
-
-std::string ParakeetStt::decode_tokens(const std::vector<int>& token_ids) {
-    std::string pieces;
-    for (int id : token_ids) {
-        auto it = vocab_.find(id);
-        if (it != vocab_.end()) pieces += it->second;
-    }
-    replace_sp_marker(pieces);
-
-    size_t start = pieces.find_first_not_of(' ');
-    if (start == std::string::npos) return "";
-    size_t end = pieces.find_last_not_of(' ');
-    return pieces.substr(start, end - start + 1);
-}
-
-// ---------------------------------------------------------------------------
-// Mel spectrogram
-// ---------------------------------------------------------------------------
-
-std::vector<float> ParakeetStt::compute_mel(const float* audio, size_t length) {
-    std::vector<float> emphasized(length);
-    emphasized[0] = audio[0];
-    for (size_t i = 1; i < length; i++) {
-        emphasized[i] = audio[i] - cfg_.pre_emphasis * audio[i - 1];
-    }
-
-    auto mel = mel_spectrogram(
-        emphasized.data(), emphasized.size(),
-        cfg_.sample_rate, cfg_.n_fft, cfg_.hop_length,
-        cfg_.win_length, cfg_.num_mel_bins);
-
-    // Per-feature normalization (NeMo AudioToMelSpectrogramPreprocessor)
-    // mel layout: [num_mel_bins * num_frames], mel[m * num_frames + t]
-    int num_frames = static_cast<int>(mel.size() / cfg_.num_mel_bins);
-    if (num_frames > 1) {
-        for (int m = 0; m < cfg_.num_mel_bins; m++) {
-            float sum = 0, sq_sum = 0;
-            for (int t = 0; t < num_frames; t++) {
-                float v = mel[m * num_frames + t];
-                sum += v;
-                sq_sum += v * v;
-            }
-            float mean = sum / num_frames;
-            float var = sq_sum / num_frames - mean * mean;
-            float std = (var > 0) ? std::sqrt(var) : 1.0f;
-            for (int t = 0; t < num_frames; t++) {
-                mel[m * num_frames + t] = (mel[m * num_frames + t] - mean) / std;
-            }
-        }
-    }
-
-    return mel;
-}
-
-// ---------------------------------------------------------------------------
-// Transcribe
-// ---------------------------------------------------------------------------
-
-ParakeetStt::Result ParakeetStt::transcribe(
-    const float* audio, size_t length, int /*sample_rate*/)
-{
-    auto* mem = OnnxEngine::get().cpu_memory();
-
-    // --- mel spectrogram [B, 128, T] ---
-
-    auto mel = compute_mel(audio, length);
-    int64_t num_frames = static_cast<int64_t>(mel.size() / cfg_.num_mel_bins);
-    const int64_t mel_shape[] = {1, static_cast<int64_t>(cfg_.num_mel_bins), num_frames};
-
-    OrtValue* t_mel = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, mel.data(), mel.size() * sizeof(float),
-        mel_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_mel));
-
-    int64_t mel_len = num_frames;
-    const int64_t len_shape[] = {1};
-    OrtValue* t_len = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, &mel_len, sizeof(int64_t),
-        len_shape, 1, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_len));
-
-    // --- encoder: audio_signal, length → outputs, encoded_lengths ---
-
-    const char* enc_in[]  = {"audio_signal", "length"};
-    const char* enc_out[] = {"outputs", "encoded_lengths"};
-    OrtValue* enc_inputs[]  = {t_mel, t_len};
-    OrtValue* enc_outputs[] = {nullptr, nullptr};
-
-    ort_check(api_, api_->Run(
-        encoder_, nullptr, enc_in, enc_inputs, 2, enc_out, 2, enc_outputs));
-
-    // Get encoded shape [B, 1024, T']
-    OrtTensorTypeAndShapeInfo* info = nullptr;
-    ort_check(api_, api_->GetTensorTypeAndShape(enc_outputs[0], &info));
-    size_t dim_count = 0;
-    api_->GetDimensionsCount(info, &dim_count);
-    std::vector<int64_t> enc_shape(dim_count);
-    api_->GetDimensions(info, enc_shape.data(), dim_count);
-    api_->ReleaseTensorTypeAndShapeInfo(info);
-
-    float* encoded = nullptr;
-    ort_check(api_, api_->GetTensorMutableData(enc_outputs[0], (void**)&encoded));
-
-    int64_t* enc_len_ptr = nullptr;
-    ort_check(api_, api_->GetTensorMutableData(enc_outputs[1], (void**)&enc_len_ptr));
-    int64_t enc_len = enc_len_ptr[0];
-    int64_t hidden  = (dim_count >= 3) ? enc_shape[1] : cfg_.encoder_hidden;
-
-    LOGI("STT: frames=%lld enc_len=%lld hidden=%lld audio=%zu enc_range=[%.4f,%.4f]",
-         num_frames, enc_len, hidden, length,
-         [&]{ float mn=encoded[0]; for(size_t i=1;i<(size_t)(hidden*enc_len);i++) if(encoded[i]<mn) mn=encoded[i]; return mn; }(),
-         [&]{ float mx=encoded[0]; for(size_t i=1;i<(size_t)(hidden*enc_len);i++) if(encoded[i]>mx) mx=encoded[i]; return mx; }());
-
-    // --- TDT greedy decode ---
-
-    auto result = tdt_decode(encoded, enc_len, hidden);
-
-    LOGI("STT: text='%.60s' conf=%.4f", result.text.c_str(), result.confidence);
-
-    // --- cleanup ---
-
-    api_->ReleaseValue(enc_outputs[1]);
-    api_->ReleaseValue(enc_outputs[0]);
-    api_->ReleaseValue(t_len);
-    api_->ReleaseValue(t_mel);
-
-    return result;
-}
-
-// ---------------------------------------------------------------------------
-// TDT greedy decoding with fused decoder_joint model
-// ---------------------------------------------------------------------------
-
-ParakeetStt::Result ParakeetStt::tdt_decode(
-    const float* encoded, int64_t enc_len, int64_t hidden)
-{
-    auto* mem = OnnxEngine::get().cpu_memory();
-
-    std::vector<int> token_ids;
-    std::string detected_language;
-    float log_prob_sum = 0.0f;
-    int log_prob_count = 0;
-
-    // LSTM states: [2, 1, 640]
-    int64_t state_size = cfg_.decoder_layers * 1 * cfg_.decoder_hidden;
-    std::vector<float> h_state(state_size, 0.0f);
-    std::vector<float> c_state(state_size, 0.0f);
-    const int64_t lstm_shape[] = {
-        static_cast<int64_t>(cfg_.decoder_layers), 1,
-        static_cast<int64_t>(cfg_.decoder_hidden)
-    };
-
-    int64_t prev_token = static_cast<int64_t>(cfg_.blank_id);
-    int64_t t = 0;
-
-    while (t < enc_len) {
-        // Encoder frame at time t: [1, hidden, 1]
-        // NeMo encoder output is [B, hidden, T'] so frame is at offset t
-        // But decoder_joint expects encoder_outputs [B, hidden, T']
-        // For greedy decode, we pass a single frame [1, hidden, 1]
-        std::vector<float> enc_frame(hidden);
-        for (int64_t h = 0; h < hidden; h++) {
-            enc_frame[h] = encoded[h * enc_len + t];  // [B, H, T] layout
-        }
-
-        const int64_t enc_frame_shape[] = {1, hidden, 1};
-        OrtValue* t_enc = nullptr;
-        ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-            mem, enc_frame.data(), enc_frame.size() * sizeof(float),
-            enc_frame_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_enc));
-
-        // Target: previous token [1, 1]
-        const int64_t tok_shape[] = {1, 1};
-        OrtValue* t_tok = nullptr;
-        ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-            mem, &prev_token, sizeof(int64_t),
-            tok_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_tok));
-
-        // Target length: [1] = 1
-        int64_t tgt_len = 1;
-        const int64_t tgt_len_shape[] = {1};
-        OrtValue* t_tgt_len = nullptr;
-        ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-            mem, &tgt_len, sizeof(int64_t),
-            tgt_len_shape, 1, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_tgt_len));
-
-        // LSTM states
-        OrtValue* t_h = nullptr;
-        ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-            mem, h_state.data(), h_state.size() * sizeof(float),
-            lstm_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_h));
-
-        OrtValue* t_c = nullptr;
-        ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-            mem, c_state.data(), c_state.size() * sizeof(float),
-            lstm_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_c));
-
-        // Run decoder_joint (v3 uses "prednet_lengths_orig" instead of "target_length")
-        const char* in_names[]  = {"encoder_outputs", "targets", "prednet_lengths_orig",
-                                   "input_states_1", "input_states_2"};
-        const char* out_names[] = {"outputs", "prednet_lengths",
-                                   "output_states_1", "output_states_2"};
-        OrtValue* inputs[]  = {t_enc, t_tok, t_tgt_len, t_h, t_c};
-        OrtValue* outputs[] = {nullptr, nullptr, nullptr, nullptr};
-
-        ort_check(api_, api_->Run(
-            decoder_joint_, nullptr,
-            in_names, inputs, 5,
-            out_names, 4, outputs));
-
-        // Logits: [1, 1, 1, total_logits] — token logits + duration logits
-        float* logits = nullptr;
-        ort_check(api_, api_->GetTensorMutableData(outputs[0], (void**)&logits));
-
-        int token_end = cfg_.vocab_size + 1;  // includes blank
-
-        // Greedy argmax: token
-        int best_token = 0;
-        float best_score = logits[0];
-        for (int i = 1; i < token_end; i++) {
-            if (logits[i] > best_score) {
-                best_score = logits[i];
-                best_token = i;
-            }
-        }
-
-        if (best_token == cfg_.blank_id) {
-            // Blank: advance time, keep LSTM state unchanged
-            t += 1;
-        } else {
-            if (best_token >= cfg_.first_text_token && best_token < cfg_.vocab_size) {
-                // Check if this is a language token
-                auto lang_it = lang_tokens_.find(best_token);
-                if (lang_it != lang_tokens_.end()) {
-                    if (detected_language.empty()) {
-                        detected_language = lang_it->second;
-                    }
-                    // Don't add language tokens to output text
-                } else {
-                    token_ids.push_back(best_token);
-                    log_prob_sum += best_score;
-                    log_prob_count++;
-                }
-            }
-
-            // Duration logits start after token logits
-            float* dur_logits = logits + token_end;
-            int dur_idx = 0;
-            float best_dur = dur_logits[0];
-            for (int d = 1; d < cfg_.num_dur_bins; d++) {
-                if (dur_logits[d] > best_dur) {
-                    best_dur = dur_logits[d];
-                    dur_idx = d;
-                }
-            }
-            t += std::max(cfg_.duration_bins[dur_idx], 1);
-
-            prev_token = best_token;
-
-            // Update LSTM states only on non-blank emission
-            float* h_out = nullptr;
-            ort_check(api_, api_->GetTensorMutableData(outputs[2], (void**)&h_out));
-            std::memcpy(h_state.data(), h_out, state_size * sizeof(float));
-
-            float* c_out = nullptr;
-            ort_check(api_, api_->GetTensorMutableData(outputs[3], (void**)&c_out));
-            std::memcpy(c_state.data(), c_out, state_size * sizeof(float));
-        }
-
-        // Cleanup
-        for (int i = 3; i >= 0; i--) api_->ReleaseValue(outputs[i]);
-        api_->ReleaseValue(t_c);
-        api_->ReleaseValue(t_h);
-        api_->ReleaseValue(t_tgt_len);
-        api_->ReleaseValue(t_tok);
-        api_->ReleaseValue(t_enc);
-    }
-
-    // --- build result ---
-
-    Result result;
-    result.text = decode_tokens(token_ids);
-    result.language = detected_language;
-
-    if (log_prob_count > 0) {
-        float mean_logit = log_prob_sum / static_cast<float>(log_prob_count);
-        result.confidence = 1.0f / (1.0f + std::exp(-mean_logit * 0.1f));
-    }
-
-    if (!result.language.empty()) {
-        LOGI("STT: detected language=%s", result.language.c_str());
-    }
-
-    return result;
-}
-
-// ---------------------------------------------------------------------------
-// Streaming: accumulate audio and re-transcribe
-// ---------------------------------------------------------------------------
-
-void ParakeetStt::begin_stream(int sample_rate) {
-    stream_buffer_.clear();
-    stream_sample_rate_ = sample_rate;
-    streaming_ = true;
-}
-
-ParakeetStt::Result ParakeetStt::push_chunk(const float* audio, size_t length) {
-    stream_buffer_.insert(stream_buffer_.end(), audio, audio + length);
-
-    // Need at least 0.5s of audio for meaningful transcription
-    if (stream_buffer_.size() < static_cast<size_t>(stream_sample_rate_ / 2)) {
-        return {};
-    }
-
-    return transcribe(stream_buffer_.data(), stream_buffer_.size(), stream_sample_rate_);
-}
-
-ParakeetStt::Result ParakeetStt::end_stream() {
-    streaming_ = false;
-    if (stream_buffer_.empty()) return {};
-
-    auto result = transcribe(stream_buffer_.data(), stream_buffer_.size(), stream_sample_rate_);
-    stream_buffer_.clear();
-    return result;
-}
-
-void ParakeetStt::cancel_stream() {
-    stream_buffer_.clear();
-    streaming_ = false;
-}
-
-void ParakeetStt::flush_stream() {
-    // No-op — single-utterance sessions only
-}
diff --git a/sdk/src/main/cpp/models/parakeet_stt.h b/sdk/src/main/cpp/models/parakeet_stt.h
deleted file mode 100644
index 5bd974f..0000000
--- a/sdk/src/main/cpp/models/parakeet_stt.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#pragma once
-
-#include <onnxruntime_c_api.h>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-/// Parakeet TDT v3 (0.6B) — speech recognition via ONNX Runtime.
-/// FastConformer encoder + fused LSTM decoder/joint network.
-/// Exported via NeMo as 2 ONNX models: encoder + decoder_joint.
-/// Input: PCM Float32 audio at 16 kHz.
-/// Output: transcribed text with language detection.
-class ParakeetStt {
-public:
-    struct Config {
-        int num_mel_bins    = 128;
-        int sample_rate     = 16000;
-        int n_fft           = 512;
-        int hop_length      = 160;
-        int win_length      = 400;
-        float pre_emphasis  = 0.97f;
-        int encoder_hidden  = 1024;
-        int decoder_hidden  = 640;
-        int decoder_layers  = 2;
-        int vocab_size      = 1024;   // SentencePiece BPE
-        int blank_id        = 1024;   // vocab_size
-        int num_dur_bins    = 5;
-        int duration_bins[5] = {0, 1, 2, 3, 4};
-        int total_logits    = 1030;   // vocab_size+1 + num_dur_bins
-        int first_text_token = 0;     // Only token 0 (<unk>) is special
-    };
-
-    struct Result {
-        std::string text;
-        std::string language;
-        float confidence = 0.0f;
-    };
-
-    /// Load encoder + decoder_joint ONNX models and vocabulary.
-    ParakeetStt(const std::string& encoder_path,
-                const std::string& decoder_joint_path,
-                const std::string& vocab_path,
-                bool nnapi = true);
-    ~ParakeetStt();
-
-    Result transcribe(const float* audio, size_t length, int sample_rate);
-    int input_sample_rate() const { return cfg_.sample_rate; }
-
-    // Streaming: accumulate audio and re-transcribe on each push_chunk call
-    bool supports_streaming() const { return true; }
-    void begin_stream(int sample_rate);
-    Result push_chunk(const float* audio, size_t length);
-    Result end_stream();
-    void cancel_stream();
-    void flush_stream();
-
-private:
-    bool load_vocab(const std::string& path);
-    std::vector<float> compute_mel(const float* audio, size_t length);
-    Result tdt_decode(const float* encoded, int64_t enc_len, int64_t hidden);
-    std::string decode_tokens(const std::vector<int>& token_ids);
-
-    const OrtApi* api_;
-    OrtSession* encoder_       = nullptr;
-    OrtSession* decoder_joint_ = nullptr;
-    Config cfg_;
-
-    // SentencePiece vocabulary: token ID → token string
-    std::unordered_map<int, std::string> vocab_;
-
-    // Language tokens: token ID → ISO 639-1 code (e.g. 64 → "en", 71 → "fr")
-    std::unordered_map<int, std::string> lang_tokens_;
-
-    // Streaming state
-    std::vector<float> stream_buffer_;
-    int stream_sample_rate_ = 16000;
-    bool streaming_ = false;
-};
diff --git a/sdk/src/main/cpp/models/silero_vad.cpp b/sdk/src/main/cpp/models/silero_vad.cpp
deleted file mode 100644
index 978797b..0000000
--- a/sdk/src/main/cpp/models/silero_vad.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-#include "silero_vad.h"
-#include "onnx_engine.h"
-#include <cstring>
-
-SileroVad::SileroVad(const std::string& model_path, bool nnapi) {
-    auto& engine = OnnxEngine::get();
-    api_ = engine.api();
-    session_ = engine.load(model_path, nnapi);
-    reset();
-}
-
-SileroVad::~SileroVad() {
-    if (session_) api_->ReleaseSession(session_);
-}
-
-void SileroVad::reset() {
-    state_.fill(0.0f);
-}
-
-float SileroVad::process_chunk(const float* samples, size_t length) {
-    auto* mem = OnnxEngine::get().cpu_memory();
-
-    // --- input tensors ---
-
-    const int64_t input_shape[] = {1, static_cast<int64_t>(length)};
-    OrtValue* t_input = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, const_cast<float*>(samples), length * sizeof(float),
-        input_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_input));
-
-    // sr is a scalar (no shape dimensions)
-    OrtValue* t_sr = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, &sr_, sizeof(int64_t),
-        nullptr, 0, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, &t_sr));
-
-    const int64_t state_shape[] = {2, 1, 128};
-    OrtValue* t_state = nullptr;
-    ort_check(api_, api_->CreateTensorWithDataAsOrtValue(
-        mem, state_.data(), state_.size() * sizeof(float),
-        state_shape, 3, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &t_state));
-
-    // --- run ---
-
-    const char* in_names[]  = {"input", "state", "sr"};
-    const char* out_names[] = {"output", "stateN"};
-    OrtValue* inputs[]  = {t_input, t_state, t_sr};
-    OrtValue* outputs[] = {nullptr, nullptr};
-
-    ort_check(api_, api_->Run(
-        session_, nullptr,
-        in_names, inputs, 3,
-        out_names, 2, outputs));
-
-    // --- extract ---
-
-    float* out_data = nullptr;
-    ort_check(api_, api_->GetTensorMutableData(outputs[0], (void**)&out_data));
-    float prob = out_data[0];
-
-    float* new_state = nullptr;
-    ort_check(api_, api_->GetTensorMutableData(outputs[1], (void**)&new_state));
-    std::memcpy(state_.data(), new_state, state_.size() * sizeof(float));
-
-    // --- cleanup ---
-
-    api_->ReleaseValue(outputs[1]);
-    api_->ReleaseValue(outputs[0]);
-    api_->ReleaseValue(t_state);
-    api_->ReleaseValue(t_sr);
-    api_->ReleaseValue(t_input);
-
-    return prob;
-}
diff --git a/sdk/src/main/cpp/models/silero_vad.h b/sdk/src/main/cpp/models/silero_vad.h
deleted file mode 100644
index 7c7307c..0000000
--- a/sdk/src/main/cpp/models/silero_vad.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#pragma once
-
-#include <onnxruntime_c_api.h>
-#include <array>
-#include <string>
-
-/// Silero VAD v5 — voice activity detection via ONNX Runtime.
-/// Input: 512 samples (32 ms @ 16 kHz) per chunk.
-/// Output: speech probability [0, 1].
-class SileroVad {
-public:
-    explicit SileroVad(const std::string& model_path, bool nnapi = false);
-    ~SileroVad();
-
-    float process_chunk(const float* samples, size_t length);
-    void reset();
-
-    int input_sample_rate() const { return 16000; }
-    size_t chunk_size() const { return 512; }
-
-private:
-    const OrtApi* api_;
-    OrtSession* session_ = nullptr;
-
-    // LSTM state carried across chunks (Silero v5: [2, 1, 128])
-    static constexpr size_t kStateSize = 2 * 1 * 128;
-    std::array<float, kStateSize> state_{};
-    int64_t sr_ = 16000;
-};
diff --git a/sdk/src/main/cpp/models/soc_detect.cpp b/sdk/src/main/cpp/models/soc_detect.cpp
deleted file mode 100644
index 5d8c546..0000000
--- a/sdk/src/main/cpp/models/soc_detect.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-#include "inference_engine.h"
-#include "onnx_backend.h"
-
-#ifdef __ANDROID__
-#include <sys/system_properties.h>
-#include <android/log.h>
-#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, "Speech", __VA_ARGS__)
-#else
-#include <cstdio>
-#define LOGI(...) do { fprintf(stderr, "[speech] "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while(0)
-#endif
-
-#include <string>
-
-enum class SocVendor { GOOGLE_TENSOR, QUALCOMM, SAMSUNG, MEDIATEK, UNKNOWN };
-
-static SocVendor detect_soc() {
-#ifdef __ANDROID__
-    char value[92] = {};
-
-    // Google Tensor: ro.hardware.chipname starts with "gs" or "zuma"
-    __system_property_get("ro.hardware.chipname", value);
-    std::string chipname(value);
-    if (chipname.find("gs") == 0 || chipname.find("zuma") == 0) {
-        LOGI("SoC: Google Tensor (%s)", chipname.c_str());
-        return SocVendor::GOOGLE_TENSOR;
-    }
-
-    // Qualcomm: ro.board.platform starts with "msm", "sm", "sdm"
-    __system_property_get("ro.board.platform", value);
-    std::string platform(value);
-    if (platform.find("msm") == 0 || platform.find("sm") == 0 ||
-        platform.find("sdm") == 0 || platform.find("lahaina") != std::string::npos ||
-        platform.find("taro") != std::string::npos || platform.find("kalama") != std::string::npos ||
-        platform.find("pineapple") != std::string::npos || platform.find("sun") != std::string::npos) {
-        LOGI("SoC: Qualcomm (%s)", platform.c_str());
-        return SocVendor::QUALCOMM;
-    }
-
-    // Samsung Exynos
-    __system_property_get("ro.hardware", value);
-    std::string hardware(value);
-    if (hardware.find("exynos") != std::string::npos) {
-        LOGI("SoC: Samsung Exynos (%s)", hardware.c_str());
-        return SocVendor::SAMSUNG;
-    }
-
-    LOGI("SoC: Unknown (chipname=%s, platform=%s, hardware=%s)",
-         chipname.c_str(), platform.c_str(), hardware.c_str());
-#endif
-    return SocVendor::UNKNOWN;
-}
-
-Backend detect_optimal_backend() {
-    SocVendor soc = detect_soc();
-    switch (soc) {
-#ifdef SPEECH_LITERT
-        case SocVendor::GOOGLE_TENSOR:
-            return Backend::LITERT;
-#endif
-        default:
-            return Backend::ONNX;
-    }
-}
-
-std::unique_ptr<InferenceBackend> create_backend(Backend preference) {
-    Backend actual = preference;
-    if (actual == Backend::AUTO) {
-        actual = detect_optimal_backend();
-    }
-
-#ifdef SPEECH_LITERT
-    if (actual == Backend::LITERT) {
-        // LiteRT backend will be implemented in litert_backend.cpp
-        // For now, fall back to ONNX
-        LOGI("LiteRT backend not yet available, using ONNX");
-        actual = Backend::ONNX;
-    }
-#endif
-
-    if (actual == Backend::LITERT) {
-        LOGI("LiteRT requested but not compiled in, using ONNX");
-        actual = Backend::ONNX;
-    }
-
-    LOGI("Inference backend: ONNX Runtime");
-    return std::make_unique<OnnxBackend>();
-}
diff --git a/sdk/src/main/cpp/util/json.h b/sdk/src/main/cpp/util/json.h
deleted file mode 100644
index 5e44bd9..0000000
--- a/sdk/src/main/cpp/util/json.h
+++ /dev/null
@@ -1,241 +0,0 @@
-#pragma once
-
-#include <fstream>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <variant>
-#include <vector>
-
-/// Minimal JSON parser for our specific model config files.
-/// Handles flat objects with string/int values and one level of nesting.
-namespace json {
-
-using Dict = std::unordered_map<std::string, std::string>;
-
-inline std::string read_file(const std::string& path) {
-    std::ifstream f(path);
-    if (!f.is_open()) return "";
-    std::ostringstream ss;
-    ss << f.rdbuf();
-    return ss.str();
-}
-
-inline void skip_ws(const std::string& s, size_t& i) {
-    while (i < s.size() && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n' || s[i] == '\r')) i++;
-}
-
-inline std::string parse_string(const std::string& s, size_t& i) {
-    if (i >= s.size() || s[i] != '"') return "";
-    i++; // skip opening quote
-    std::string result;
-    while (i < s.size() && s[i] != '"') {
-        if (s[i] == '\\' && i + 1 < s.size()) {
-            i++;
-            switch (s[i]) {
-                case '"': result += '"'; break;
-                case '\\': result += '\\'; break;
-                case 'n': result += '\n'; break;
-                case 't': result += '\t'; break;
-                case 'u': {
-                    // Parse \uXXXX → UTF-8
-                    if (i + 4 < s.size()) {
-                        std::string hex = s.substr(i + 1, 4);
-                        unsigned long cp = std::stoul(hex, nullptr, 16);
-                        i += 4;
-                        if (cp < 0x80) {
-                            result += static_cast<char>(cp);
-                        } else if (cp < 0x800) {
-                            result += static_cast<char>(0xC0 | (cp >> 6));
-                            result += static_cast<char>(0x80 | (cp & 0x3F));
-                        } else {
-                            result += static_cast<char>(0xE0 | (cp >> 12));
-                            result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
-                            result += static_cast<char>(0x80 | (cp & 0x3F));
-                        }
-                    }
-                    break;
-                }
-                default: result += s[i]; break;
-            }
-        } else {
-            result += s[i];
-        }
-        i++;
-    }
-    if (i < s.size()) i++; // skip closing quote
-    return result;
-}
-
-inline std::string parse_value_raw(const std::string& s, size_t& i) {
-    skip_ws(s, i);
-    if (i >= s.size()) return "";
-
-    if (s[i] == '"') return parse_string(s, i);
-
-    // Number, bool, null
-    std::string val;
-    while (i < s.size() && s[i] != ',' && s[i] != '}' && s[i] != ']'
-           && s[i] != ' ' && s[i] != '\n' && s[i] != '\r') {
-        val += s[i++];
-    }
-    return val;
-}
-
-/// Skip a JSON value (string, number, object, array)
-inline void skip_value(const std::string& s, size_t& i) {
-    skip_ws(s, i);
-    if (i >= s.size()) return;
-    if (s[i] == '"') { parse_string(s, i); return; }
-    if (s[i] == '{') {
-        int depth = 1; i++;
-        while (i < s.size() && depth > 0) {
-            if (s[i] == '{') { depth++; i++; }
-            else if (s[i] == '}') { depth--; i++; }
-            else if (s[i] == '"') { parse_string(s, i); }
-            else { i++; }
-        }
-        return;
-    }
-    if (s[i] == '[') {
-        int depth = 1; i++;
-        while (i < s.size() && depth > 0) {
-            if (s[i] == '[') { depth++; i++; }
-            else if (s[i] == ']') { depth--; i++; }
-            else if (s[i] == '"') { parse_string(s, i); }
-            else { i++; }
-        }
-        return;
-    }
-    parse_value_raw(s, i);
-}
-
-/// Parse {"key": "value", ...} → map<string, string>
-/// Works for string and integer values (ints stored as strings).
-inline Dict parse_flat_object(const std::string& text) {
-    Dict result;
-    size_t i = 0;
-    skip_ws(text, i);
-    if (i >= text.size() || text[i] != '{') return result;
-    i++;
-
-    while (i < text.size()) {
-        skip_ws(text, i);
-        if (text[i] == '}') break;
-        if (text[i] == ',') { i++; continue; }
-
-        auto key = parse_string(text, i);
-        skip_ws(text, i);
-        if (i < text.size() && text[i] == ':') i++;
-        skip_ws(text, i);
-
-        if (i < text.size() && text[i] == '{') {
-            // Nested object — skip it for flat parsing
-            skip_value(text, i);
-        } else {
-            auto val = parse_value_raw(text, i);
-            result[key] = val;
-        }
-    }
-    return result;
-}
-
-/// Heteronym entry: either a simple string or POS-tagged map.
-struct DictEntry {
-    std::string simple;
-    std::unordered_map<std::string, std::string> pos_map;  // empty if simple
-    bool is_heteronym() const { return !pos_map.empty(); }
-};
-
-/// Parse pronunciation dictionary: {"word": "phonemes", "word2": {"VERB": "p1", "DEFAULT": "p2"}}
-inline std::unordered_map<std::string, DictEntry> parse_dictionary(const std::string& text) {
-    std::unordered_map<std::string, DictEntry> result;
-    size_t i = 0;
-    skip_ws(text, i);
-    if (i >= text.size() || text[i] != '{') return result;
-    i++;
-
-    while (i < text.size()) {
-        skip_ws(text, i);
-        if (text[i] == '}') break;
-        if (text[i] == ',') { i++; continue; }
-
-        auto key = parse_string(text, i);
-        skip_ws(text, i);
-        if (i < text.size() && text[i] == ':') i++;
-        skip_ws(text, i);
-
-        DictEntry entry;
-        if (i < text.size() && text[i] == '"') {
-            entry.simple = parse_string(text, i);
-        } else if (i < text.size() && text[i] == '{') {
-            // Nested POS map
-            i++; // skip {
-            while (i < text.size()) {
-                skip_ws(text, i);
-                if (text[i] == '}') { i++; break; }
-                if (text[i] == ',') { i++; continue; }
-                auto pos = parse_string(text, i);
-                skip_ws(text, i);
-                if (i < text.size() && text[i] == ':') i++;
-                skip_ws(text, i);
-                if (i < text.size() && text[i] == 'n') {
-                    // null value
-                    skip_value(text, i);
-                } else {
-                    auto pron = parse_value_raw(text, i);
-                    entry.pos_map[pos] = pron;
-                }
-            }
-        } else {
-            skip_value(text, i);
-        }
-        result[key] = std::move(entry);
-    }
-    return result;
-}
-
-/// Parse vocab_index.json: {"vocab": {"sym": id, ...}} or flat {"sym": id, ...}
-inline std::unordered_map<std::string, int> parse_vocab_index(const std::string& text) {
-    std::unordered_map<std::string, int> result;
-    size_t i = 0;
-    skip_ws(text, i);
-    if (i >= text.size() || text[i] != '{') return result;
-    i++;
-
-    // Check if nested under "vocab" key
-    size_t save = i;
-    skip_ws(text, i);
-    auto first_key = parse_string(text, i);
-    skip_ws(text, i);
-    if (i < text.size() && text[i] == ':') i++;
-    skip_ws(text, i);
-
-    size_t obj_start;
-    if (first_key == "vocab" && i < text.size() && text[i] == '{') {
-        obj_start = i + 1;
-    } else {
-        // Flat format — restart
-        i = save;
-        obj_start = i;
-    }
-
-    i = obj_start;
-    while (i < text.size()) {
-        skip_ws(text, i);
-        if (text[i] == '}') break;
-        if (text[i] == ',') { i++; continue; }
-
-        auto sym = parse_string(text, i);
-        skip_ws(text, i);
-        if (i < text.size() && text[i] == ':') i++;
-        auto val = parse_value_raw(text, i);
-
-        try {
-            result[sym] = std::stoi(val);
-        } catch (...) {}
-    }
-    return result;
-}
-
-} // namespace json
diff --git a/setup.sh b/setup.sh
index d49af03..37f5f29 100755
--- a/setup.sh
+++ b/setup.sh
@@ -66,24 +66,6 @@ else
     echo "ONNX Runtime already installed"
 fi
 
-# --- .gitignore ---
-
-cat > "${ROOT}/.gitignore" << 'GITIGNORE'
-# Build
-.gradle/
-build/
-*.iml
-.idea/
-local.properties
-
-# ONNX Runtime (downloaded by setup.sh)
-/ort/
-
-# Native build artifacts
-.cxx/
-.externalNativeBuild/
-GITIGNORE
-
 echo ""
 echo "Done. Open the project in Android Studio or run:"
 echo "  ./gradlew :app:assembleDebug"
diff --git a/speech-core b/speech-core
index 679869d..ba75579 160000
--- a/speech-core
+++ b/speech-core
@@ -1 +1 @@
-Subproject commit 679869d9e91ec159611a086e7e5825daa073e72e
+Subproject commit ba755794e6aabf9b98580ce8e591c1abd5ee2387