Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/common/ai/factory.js
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,14 @@ const PROVIDERS = {
{ id: 'whisper-medium', name: 'Whisper Medium (769M)' },
],
},
'soniox': {
name: 'Soniox',
handler: () => require("./providers/soniox"),
llmModels: [],
sttModels: [
{ id: 'en_v2', name: 'Soniox English v2' }
],
},
};

function sanitizeModelId(model) {
Expand Down
122 changes: 122 additions & 0 deletions src/common/ai/providers/soniox.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Soniox STT Provider
// https://soniox.com/docs

const https = require('https');
const EventEmitter = require('events');

/**
 * Speech-to-text session backed by the Soniox HTTP recognize API.
 *
 * Emits:
 *  - 'transcription' ({ text, isFinal }) for each recognized, non-empty chunk
 *  - 'error' (Error) when a request, HTTP status, or JSON parse fails
 */
class SonioxSTTSession extends EventEmitter {
    /**
     * @param {string} apiKey - Soniox API key, sent as a Bearer token.
     * @param {string} [model='en_v2'] - Soniox model id (see provider registry).
     * @param {string} [sessionId] - External session id; generated when omitted.
     */
    constructor(apiKey, model = 'en_v2', sessionId) {
        super();
        this.apiKey = apiKey;
        this.model = model;
        // `substr` is deprecated; `slice` is the supported equivalent.
        this.sessionId = sessionId || `soniox_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
        this.isRunning = false;
        this.audioBuffer = Buffer.alloc(0);
        this.lastTranscription = '';
    }

    /**
     * Marks the session as running. Soniox needs no local model download,
     * only a valid API key, so this never fails locally.
     * @returns {Promise<boolean>} always true
     */
    async initialize() {
        this.isRunning = true;
        return true;
    }

    /**
     * POSTs a complete WAV buffer to the Soniox recognize endpoint.
     * See https://soniox.com/docs/speech-recognition/api.html#recognize-audio
     * @param {Buffer} audioBuffer - WAV-encoded audio.
     * @returns {Promise<object>} parsed JSON response body
     * @throws rejects on network errors, non-2xx HTTP status, or invalid JSON
     */
    transcribe(audioBuffer) {
        return new Promise((resolve, reject) => {
            const options = {
                hostname: 'api.soniox.com',
                path: '/v2/recognize',
                method: 'POST',
                headers: {
                    'Authorization': `Bearer ${this.apiKey}`,
                    'Content-Type': 'audio/wav',
                    'Accept': 'application/json',
                    'soniox-model': this.model,
                },
            };
            const req = https.request(options, (res) => {
                let data = '';
                res.setEncoding('utf8');
                res.on('data', (chunk) => { data += chunk; });
                res.on('end', () => {
                    // Fix: the original resolved error payloads (401/429/500)
                    // as if they were transcriptions. Surface them as errors.
                    if (res.statusCode < 200 || res.statusCode >= 300) {
                        reject(new Error(`Soniox API error ${res.statusCode}: ${data}`));
                        return;
                    }
                    try {
                        resolve(JSON.parse(data));
                    } catch (e) {
                        reject(e);
                    }
                });
            });
            req.on('error', reject);
            req.write(audioBuffer);
            req.end();
        });
    }

    /**
     * Transcribes one audio chunk and emits the result.
     * Silently drops chunks received after stop(); emits 'error' on failure.
     * @param {Buffer} audioBuffer - WAV-encoded audio.
     */
    async processAudioChunk(audioBuffer) {
        if (!this.isRunning) return;
        try {
            const result = await this.transcribe(audioBuffer);
            const text = (result.text || '').trim();
            // Fix: do not emit empty 'transcription' events for silent chunks.
            if (text === '') return;
            this.lastTranscription = text;
            this.emit('transcription', { text, isFinal: true });
        } catch (err) {
            this.emit('error', err);
        }
    }

    /**
     * Entry point for streaming input. Accepts a Buffer or a base64 string
     * containing either a complete WAV file or raw PCM
     * (assumed 16-bit mono 16kHz — TODO confirm against callers).
     * @param {Buffer|string} audioBuffer
     */
    async sendRealtimeInput(audioBuffer) {
        let buf = typeof audioBuffer === 'string'
            ? Buffer.from(audioBuffer, 'base64')
            : audioBuffer;
        if (!isWav(buf)) {
            buf = pcmToWav(buf);
        }
        await this.processAudioChunk(buf);
    }

    /** Stops accepting new audio; in-flight requests may still settle. */
    stop() {
        this.isRunning = false;
    }

    /** Stops the session and detaches all listeners. */
    close() {
        this.stop();
        this.removeAllListeners();
    }
}

/**
 * Wraps raw PCM samples in a minimal 44-byte RIFF/WAVE header.
 *
 * @param {Buffer} buffer - Raw PCM sample data.
 * @param {number} [sampleRate=16000] - Samples per second.
 * @param {number} [numChannels=1] - Channel count.
 * @param {number} [bitsPerSample=16] - Sample width in bits
 *   (generalized from the previously hard-coded 16-bit assumption;
 *   defaults preserve the old behavior exactly).
 * @returns {Buffer} A complete WAV file buffer (header + data).
 */
function pcmToWav(buffer, sampleRate = 16000, numChannels = 1, bitsPerSample = 16) {
    const bytesPerSample = bitsPerSample / 8;
    const dataLength = buffer.length;
    const header = Buffer.alloc(44);
    header.write('RIFF', 0);                                             // ChunkID
    header.writeUInt32LE(36 + dataLength, 4);                            // ChunkSize
    header.write('WAVE', 8);                                             // Format
    header.write('fmt ', 12);                                            // Subchunk1ID
    header.writeUInt32LE(16, 16);                                        // Subchunk1Size (PCM)
    header.writeUInt16LE(1, 20);                                         // AudioFormat: 1 = PCM
    header.writeUInt16LE(numChannels, 22);                               // NumChannels
    header.writeUInt32LE(sampleRate, 24);                                // SampleRate
    header.writeUInt32LE(sampleRate * numChannels * bytesPerSample, 28); // ByteRate
    header.writeUInt16LE(numChannels * bytesPerSample, 32);              // BlockAlign
    header.writeUInt16LE(bitsPerSample, 34);                             // BitsPerSample
    header.write('data', 36);                                            // Subchunk2ID
    header.writeUInt32LE(dataLength, 40);                                // Subchunk2Size
    return Buffer.concat([header, buffer]);
}

/**
 * Returns true when the buffer already carries a RIFF/WAVE header.
 * Fix: always returns a boolean — the original `a && b && c` chain returned
 * the falsy input itself (e.g. `undefined`) when given no buffer.
 * @param {Buffer|null|undefined} buffer
 * @returns {boolean}
 */
function isWav(buffer) {
    if (!buffer || buffer.length <= 12) return false;
    return buffer.toString('ascii', 0, 4) === 'RIFF'
        && buffer.toString('ascii', 8, 12) === 'WAVE';
}

function createSTT(opts) {
return new SonioxSTTSession(opts.apiKey, opts.model);
}

module.exports = {
createSTT,
SonioxSTTSession
};
15 changes: 15 additions & 0 deletions src/common/services/modelStateService.js
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,21 @@ class ModelStateService {
// Default to success if no specific validator is found
console.warn(`[ModelStateService] No validateApiKey function for provider: ${provider}. Assuming valid.`);
return { success: true };

}
case 'soniox': {
// Soniox API key is a 32+ char string, optionally validate format or do a real API call
if (typeof key !== 'string' || key.length < 32) {
return { success: false, error: 'Invalid Soniox API key format.' };
}

this.setApiKey(provider, key);
console.log(`[ModelStateService] API key for ${provider} is valid.`);
return { success: true };
}
default:
return { success: false, error: 'Unknown provider.' };

}

try {
Expand Down
66 changes: 13 additions & 53 deletions src/features/listen/stt/sttService.js
Original file line number Diff line number Diff line change
Expand Up @@ -133,45 +133,26 @@ class SttService {
return;
}

if (this.modelInfo.provider === 'whisper') {
// Whisper STT emits 'transcription' events with different structure
if (this.modelInfo.provider === 'whisper' || this.modelInfo.provider === 'soniox') {
// Whisper and Soniox STT emit 'transcription' events with similar structure
if (message.text && message.text.trim()) {
const finalText = message.text.trim();

// Filter out Whisper noise transcriptions
// Filter out noise for Whisper, for Soniox just check length
const noisePatterns = [
'[BLANK_AUDIO]',
'[INAUDIBLE]',
'[MUSIC]',
'[SOUND]',
'[NOISE]',
'(BLANK_AUDIO)',
'(INAUDIBLE)',
'(MUSIC)',
'(SOUND)',
'(NOISE)'
'[BLANK_AUDIO]', '[INAUDIBLE]', '[MUSIC]', '[SOUND]', '[NOISE]',
'(BLANK_AUDIO)', '(INAUDIBLE)', '(MUSIC)', '(SOUND)', '(NOISE)'
];



const normalizedText = finalText.toLowerCase().trim();

const isNoise = noisePatterns.some(pattern =>
finalText.includes(pattern) || finalText === pattern
);


const isNoise = this.modelInfo.provider === 'whisper' && noisePatterns.some(pattern => finalText.includes(pattern) || finalText === pattern);
if (!isNoise && finalText.length > 2) {
this.debounceMyCompletion(finalText);

this.sendToRenderer('stt-update', {
speaker: 'Me',
text: finalText,
isPartial: false,
isFinal: true,
timestamp: Date.now(),
});
} else {
} else if (this.modelInfo.provider === 'whisper') {
console.log(`[Whisper-Me] Filtered noise: "${finalText}"`);
}
}
Expand Down Expand Up @@ -246,45 +227,24 @@ class SttService {
return;
}

if (this.modelInfo.provider === 'whisper') {
// Whisper STT emits 'transcription' events with different structure
if (this.modelInfo.provider === 'whisper' || this.modelInfo.provider === 'soniox') {
if (message.text && message.text.trim()) {
const finalText = message.text.trim();

// Filter out Whisper noise transcriptions
const noisePatterns = [
'[BLANK_AUDIO]',
'[INAUDIBLE]',
'[MUSIC]',
'[SOUND]',
'[NOISE]',
'(BLANK_AUDIO)',
'(INAUDIBLE)',
'(MUSIC)',
'(SOUND)',
'(NOISE)'
'[BLANK_AUDIO]', '[INAUDIBLE]', '[MUSIC]', '[SOUND]', '[NOISE]',
'(BLANK_AUDIO)', '(INAUDIBLE)', '(MUSIC)', '(SOUND)', '(NOISE)'
];


const normalizedText = finalText.toLowerCase().trim();

const isNoise = noisePatterns.some(pattern =>
finalText.includes(pattern) || finalText === pattern
);


// Only process if it's not noise, not a false positive, and has meaningful content
const isNoise = this.modelInfo.provider === 'whisper' && noisePatterns.some(pattern => finalText.includes(pattern) || finalText === pattern);
if (!isNoise && finalText.length > 2) {
this.debounceTheirCompletion(finalText);

this.sendToRenderer('stt-update', {
speaker: 'Them',
text: finalText,
isPartial: false,
isFinal: true,
timestamp: Date.now(),
});
} else {
} else if (this.modelInfo.provider === 'whisper') {
console.log(`[Whisper-Them] Filtered noise: "${finalText}"`);
}
}
Expand Down Expand Up @@ -613,4 +573,4 @@ class SttService {
}
}

module.exports = SttService;
module.exports = SttService;