Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/common/ai/factory.js
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,14 @@ const PROVIDERS = {
{ id: 'whisper-medium', name: 'Whisper Medium (769M)' },
],
},
'soniox': {
name: 'Soniox',
handler: () => require("./providers/soniox"),
llmModels: [],
sttModels: [
{ id: 'en_v2', name: 'Soniox English v2' }
],
},
};

function sanitizeModelId(model) {
Expand Down
122 changes: 122 additions & 0 deletions src/common/ai/providers/soniox.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Soniox STT Provider
// https://soniox.com/docs

const https = require('https');
const EventEmitter = require('events');

/**
 * Speech-to-text session backed by the Soniox HTTP recognize API.
 *
 * Emits:
 *  - 'transcription' ({ text, isFinal }) for each recognized, non-empty chunk
 *  - 'error' (Error) when a request, HTTP status, or JSON parse fails
 */
class SonioxSTTSession extends EventEmitter {
    /**
     * @param {string} apiKey - Soniox API key, sent as a Bearer token.
     * @param {string} [model='en_v2'] - Soniox model id (see provider registry).
     * @param {string} [sessionId] - External session id; generated when omitted.
     */
    constructor(apiKey, model = 'en_v2', sessionId) {
        super();
        this.apiKey = apiKey;
        this.model = model;
        // `substr` is deprecated; `slice` is the supported equivalent.
        this.sessionId = sessionId || `soniox_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
        this.isRunning = false;
        this.audioBuffer = Buffer.alloc(0);
        this.lastTranscription = '';
    }

    /**
     * Marks the session as running. Soniox needs no local model download,
     * only a valid API key, so this never fails locally.
     * @returns {Promise<boolean>} always true
     */
    async initialize() {
        this.isRunning = true;
        return true;
    }

    /**
     * POSTs a complete WAV buffer to the Soniox recognize endpoint.
     * See https://soniox.com/docs/speech-recognition/api.html#recognize-audio
     * @param {Buffer} audioBuffer - WAV-encoded audio.
     * @returns {Promise<object>} parsed JSON response body
     * @throws rejects on network errors, non-2xx HTTP status, or invalid JSON
     */
    transcribe(audioBuffer) {
        return new Promise((resolve, reject) => {
            const options = {
                hostname: 'api.soniox.com',
                path: '/v2/recognize',
                method: 'POST',
                headers: {
                    'Authorization': `Bearer ${this.apiKey}`,
                    'Content-Type': 'audio/wav',
                    'Accept': 'application/json',
                    'soniox-model': this.model,
                },
            };
            const req = https.request(options, (res) => {
                let data = '';
                res.setEncoding('utf8');
                res.on('data', (chunk) => { data += chunk; });
                res.on('end', () => {
                    // Fix: the original resolved error payloads (401/429/500)
                    // as if they were transcriptions. Surface them as errors.
                    if (res.statusCode < 200 || res.statusCode >= 300) {
                        reject(new Error(`Soniox API error ${res.statusCode}: ${data}`));
                        return;
                    }
                    try {
                        resolve(JSON.parse(data));
                    } catch (e) {
                        reject(e);
                    }
                });
            });
            req.on('error', reject);
            req.write(audioBuffer);
            req.end();
        });
    }

    /**
     * Transcribes one audio chunk and emits the result.
     * Silently drops chunks received after stop(); emits 'error' on failure.
     * @param {Buffer} audioBuffer - WAV-encoded audio.
     */
    async processAudioChunk(audioBuffer) {
        if (!this.isRunning) return;
        try {
            const result = await this.transcribe(audioBuffer);
            const text = (result.text || '').trim();
            // Fix: do not emit empty 'transcription' events for silent chunks.
            if (text === '') return;
            this.lastTranscription = text;
            this.emit('transcription', { text, isFinal: true });
        } catch (err) {
            this.emit('error', err);
        }
    }

    /**
     * Entry point for streaming input. Accepts a Buffer or a base64 string
     * containing either a complete WAV file or raw PCM
     * (assumed 16-bit mono 16kHz — TODO confirm against callers).
     * @param {Buffer|string} audioBuffer
     */
    async sendRealtimeInput(audioBuffer) {
        let buf = typeof audioBuffer === 'string'
            ? Buffer.from(audioBuffer, 'base64')
            : audioBuffer;
        if (!isWav(buf)) {
            buf = pcmToWav(buf);
        }
        await this.processAudioChunk(buf);
    }

    /** Stops accepting new audio; in-flight requests may still settle. */
    stop() {
        this.isRunning = false;
    }

    /** Stops the session and detaches all listeners. */
    close() {
        this.stop();
        this.removeAllListeners();
    }
}

/**
 * Wraps raw PCM samples in a minimal 44-byte RIFF/WAVE header.
 *
 * @param {Buffer} buffer - Raw PCM sample data.
 * @param {number} [sampleRate=16000] - Samples per second.
 * @param {number} [numChannels=1] - Channel count.
 * @param {number} [bitsPerSample=16] - Sample width in bits
 *   (generalized from the previously hard-coded 16-bit assumption;
 *   defaults preserve the old behavior exactly).
 * @returns {Buffer} A complete WAV file buffer (header + data).
 */
function pcmToWav(buffer, sampleRate = 16000, numChannels = 1, bitsPerSample = 16) {
    const bytesPerSample = bitsPerSample / 8;
    const dataLength = buffer.length;
    const header = Buffer.alloc(44);
    header.write('RIFF', 0);                                             // ChunkID
    header.writeUInt32LE(36 + dataLength, 4);                            // ChunkSize
    header.write('WAVE', 8);                                             // Format
    header.write('fmt ', 12);                                            // Subchunk1ID
    header.writeUInt32LE(16, 16);                                        // Subchunk1Size (PCM)
    header.writeUInt16LE(1, 20);                                         // AudioFormat: 1 = PCM
    header.writeUInt16LE(numChannels, 22);                               // NumChannels
    header.writeUInt32LE(sampleRate, 24);                                // SampleRate
    header.writeUInt32LE(sampleRate * numChannels * bytesPerSample, 28); // ByteRate
    header.writeUInt16LE(numChannels * bytesPerSample, 32);              // BlockAlign
    header.writeUInt16LE(bitsPerSample, 34);                             // BitsPerSample
    header.write('data', 36);                                            // Subchunk2ID
    header.writeUInt32LE(dataLength, 40);                                // Subchunk2Size
    return Buffer.concat([header, buffer]);
}

/**
 * Returns true when the buffer already carries a RIFF/WAVE header.
 * Fix: always returns a boolean — the original `a && b && c` chain returned
 * the falsy input itself (e.g. `undefined`) when given no buffer.
 * @param {Buffer|null|undefined} buffer
 * @returns {boolean}
 */
function isWav(buffer) {
    if (!buffer || buffer.length <= 12) return false;
    return buffer.toString('ascii', 0, 4) === 'RIFF'
        && buffer.toString('ascii', 8, 12) === 'WAVE';
}

function createSTT(opts) {
return new SonioxSTTSession(opts.apiKey, opts.model);
}

module.exports = {
createSTT,
SonioxSTTSession
};
15 changes: 15 additions & 0 deletions src/common/services/modelStateService.js
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,21 @@ class ModelStateService {
// Default to success if no specific validator is found
console.warn(`[ModelStateService] No validateApiKey function for provider: ${provider}. Assuming valid.`);
return { success: true };

}
case 'soniox': {
// Soniox API key is a 32+ char string, optionally validate format or do a real API call
if (typeof key !== 'string' || key.length < 32) {
return { success: false, error: 'Invalid Soniox API key format.' };
}

this.setApiKey(provider, key);
console.log(`[ModelStateService] API key for ${provider} is valid.`);
return { success: true };
}
default:
return { success: false, error: 'Unknown provider.' };

}

try {
Expand Down
66 changes: 13 additions & 53 deletions src/features/listen/stt/sttService.js
Original file line number Diff line number Diff line change
Expand Up @@ -133,45 +133,26 @@ class SttService {
return;
}

if (this.modelInfo.provider === 'whisper') {
// Whisper STT emits 'transcription' events with different structure
if (this.modelInfo.provider === 'whisper' || this.modelInfo.provider === 'soniox') {
// Whisper and Soniox STT emit 'transcription' events with similar structure
if (message.text && message.text.trim()) {
const finalText = message.text.trim();

// Filter out Whisper noise transcriptions
// Filter out noise for Whisper, for Soniox just check length
const noisePatterns = [
'[BLANK_AUDIO]',
'[INAUDIBLE]',
'[MUSIC]',
'[SOUND]',
'[NOISE]',
'(BLANK_AUDIO)',
'(INAUDIBLE)',
'(MUSIC)',
'(SOUND)',
'(NOISE)'
'[BLANK_AUDIO]', '[INAUDIBLE]', '[MUSIC]', '[SOUND]', '[NOISE]',
'(BLANK_AUDIO)', '(INAUDIBLE)', '(MUSIC)', '(SOUND)', '(NOISE)'
];



const normalizedText = finalText.toLowerCase().trim();

const isNoise = noisePatterns.some(pattern =>
finalText.includes(pattern) || finalText === pattern
);


const isNoise = this.modelInfo.provider === 'whisper' && noisePatterns.some(pattern => finalText.includes(pattern) || finalText === pattern);
if (!isNoise && finalText.length > 2) {
this.debounceMyCompletion(finalText);

this.sendToRenderer('stt-update', {
speaker: 'Me',
text: finalText,
isPartial: false,
isFinal: true,
timestamp: Date.now(),
});
} else {
} else if (this.modelInfo.provider === 'whisper') {
console.log(`[Whisper-Me] Filtered noise: "${finalText}"`);
}
}
Expand Down Expand Up @@ -246,45 +227,24 @@ class SttService {
return;
}

if (this.modelInfo.provider === 'whisper') {
// Whisper STT emits 'transcription' events with different structure
if (this.modelInfo.provider === 'whisper' || this.modelInfo.provider === 'soniox') {
if (message.text && message.text.trim()) {
const finalText = message.text.trim();

// Filter out Whisper noise transcriptions
const noisePatterns = [
'[BLANK_AUDIO]',
'[INAUDIBLE]',
'[MUSIC]',
'[SOUND]',
'[NOISE]',
'(BLANK_AUDIO)',
'(INAUDIBLE)',
'(MUSIC)',
'(SOUND)',
'(NOISE)'
'[BLANK_AUDIO]', '[INAUDIBLE]', '[MUSIC]', '[SOUND]', '[NOISE]',
'(BLANK_AUDIO)', '(INAUDIBLE)', '(MUSIC)', '(SOUND)', '(NOISE)'
];


const normalizedText = finalText.toLowerCase().trim();

const isNoise = noisePatterns.some(pattern =>
finalText.includes(pattern) || finalText === pattern
);


// Only process if it's not noise, not a false positive, and has meaningful content
const isNoise = this.modelInfo.provider === 'whisper' && noisePatterns.some(pattern => finalText.includes(pattern) || finalText === pattern);
if (!isNoise && finalText.length > 2) {
this.debounceTheirCompletion(finalText);

this.sendToRenderer('stt-update', {
speaker: 'Them',
text: finalText,
isPartial: false,
isFinal: true,
timestamp: Date.now(),
});
} else {
} else if (this.modelInfo.provider === 'whisper') {
console.log(`[Whisper-Them] Filtered noise: "${finalText}"`);
}
}
Expand Down Expand Up @@ -613,4 +573,4 @@ class SttService {
}
}

module.exports = SttService;
module.exports = SttService;