Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
33e693f
refactor: enhance end of turn penalty logic
sam-s10s Feb 23, 2026
301bcf4
refactor: enhance end of turn penalty logic
sam-s10s Feb 23, 2026
d9de589
Add Penalty when Smart Turn hasn't been run (#86)
LArmstrongDev Feb 25, 2026
3375c3d
Merge branch 'fix/smart-turn' of https://github.com/speechmatics/spee…
sam-s10s Feb 25, 2026
7a52b3f
test: add `test_no_feou_fix` for FEOU disabled
sam-s10s Mar 2, 2026
1443b33
feat: integrate config validation and improve presets
sam-s10s Mar 2, 2026
386f37b
fix: enforce use of forced end of utterance
sam-s10s Mar 3, 2026
155fceb
refactor: simplify EOU and VAD logic, improve readability
sam-s10s Mar 3, 2026
0b28473
refactor: remove forced end-of-utterance config from tests
sam-s10s Mar 3, 2026
31aa3ac
remove: Delete outdated conditional validation for 'use_forced_eou' i…
sam-s10s Mar 3, 2026
ca0f22f
fix: handle forced EOU more securely in turn management
sam-s10s Mar 3, 2026
95dda05
manually set FEOU to be disabled for the tests.
sam-s10s Mar 3, 2026
5ecc473
remove `ws_headers` as part of a different PR
sam-s10s Mar 3, 2026
e30cc5e
fix: correct logic for end of utterance handling
sam-s10s Mar 3, 2026
cd7de39
refactor: extract config setup and ensure client disconnect
sam-s10s Mar 3, 2026
7f03cc5
chore: pin speechmatics-rt dependency version for voice
sam-s10s Mar 5, 2026
81815ee
fix: support 8kHz audio in VAD and smart turn
sam-s10s Mar 10, 2026
0e56620
fix: only predict end of turn when speech ended
sam-s10s Mar 11, 2026
4182979
test: re-enable speaker focus test cases
sam-s10s Mar 11, 2026
5583174
test: use env var for RT URL and fix assertions
sam-s10s Mar 11, 2026
18b56f9
fix: remove unused turn extend delay and dead code
sam-s10s Mar 11, 2026
8720ed4
Merge branch 'main' into fix/smart-turn
sam-s10s Mar 24, 2026
103cac4
Relax speechmatics-rt version pin to minimum
sam-s10s Mar 24, 2026
99b8aaf
Bump speechmatics-rt to v1.0.0 and adjust timestamps for forced end o…
sam-s10s Mar 31, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sdk/rt/speechmatics/rt/_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

# Sentinel marking "argument not supplied": lets `force_end_of_utterance`
# distinguish a caller omitting `timestamp` (auto-set from audio sent)
# from a caller passing an explicit value.
_UNSET = object()


class AsyncClient(_BaseClient):
"""
Asynchronous client for Speechmatics real-time audio transcription.
Expand Down Expand Up @@ -195,7 +196,7 @@ async def force_end_of_utterance(self, *, timestamp: Optional[float] | object =
... await client.force_end_of_utterance()
"""

message: dict[str,Any] = {"message": ClientMessageType.FORCE_END_OF_UTTERANCE}
message: dict[str, Any] = {"message": ClientMessageType.FORCE_END_OF_UTTERANCE}

if timestamp is _UNSET:
# default: auto-set from audio_seconds_sent
Expand Down
2 changes: 1 addition & 1 deletion sdk/voice/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ authors = [{ name = "Speechmatics", email = "support@speechmatics.com" }]
license = "MIT"
requires-python = ">=3.9"
dependencies = [
"speechmatics-rt>=0.5.3",
"speechmatics-rt>=1.0.0",
"pydantic>=2.10.6,<3",
"numpy>=1.26.4,<3"
]
Expand Down
142 changes: 92 additions & 50 deletions sdk/voice/speechmatics/voice/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ def __init__(
preset_config = VoiceAgentConfigPreset.load(preset)
config = VoiceAgentConfigPreset._merge_configs(preset_config, config)

# Validate the final config (deferred to allow overlay/preset merging first)
if config is not None:
config.validate_config()

# Process the config
self._config, self._transcription_config, self._audio_format = self._prepare_config(config)

Expand Down Expand Up @@ -310,24 +314,26 @@ def __init__(
self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize)
self._eot_calculation_task: Optional[asyncio.Task] = None

# Uses fixed EndOfUtterance message from STT
self._uses_fixed_eou: bool = (
self._eou_mode == EndOfUtteranceMode.FIXED
and not self._silero_detector
and not self._config.end_of_turn_config.use_forced_eou
)

# Uses ForceEndOfUtterance message
self._uses_forced_eou: bool = not self._uses_fixed_eou
# Forced end of utterance handling
# FEOU is not used in FIXED mode, unless VAD has been enabled. It can / should
# also be disabled during testing when not connected to an endpoint, as the
# waiting for FEOU response will block the test.
self._use_forced_eou: bool = self._eou_mode is not EndOfUtteranceMode.FIXED or self._uses_silero_vad
self._forced_eou_active: bool = False
self._last_forced_eou_latency: float = 0.0

# Emit EOT prediction (uses _uses_forced_eou)
self._uses_eot_prediction: bool = self._eou_mode not in [
# Emit EOT prediction
# EOT predictions are only relevant when not using the FIXED or EXTERNAL modes,
# as these use different triggers to finalize the turn.
self._emit_eot_predictions: bool = self._eou_mode not in [
EndOfUtteranceMode.FIXED,
EndOfUtteranceMode.EXTERNAL,
]

# Time slip for Forced End Of Utterance
self._feou_chunk_s: float = 0.360
self._feou_padding_s: float = 0.0

# -------------------------------------
# Diarization / Speakers
# -------------------------------------
Expand Down Expand Up @@ -360,8 +366,8 @@ def __init__(
AudioEncoding.PCM_S16LE: 2,
}.get(self._audio_format.encoding, 1)

# Default audio buffer
if not self._config.audio_buffer_length and (self._uses_smart_turn or self._uses_silero_vad):
# Default audio buffer (used when Silero VAD is enabled and with Smart Turn)
if not self._config.audio_buffer_length and self._uses_silero_vad:
self._config.audio_buffer_length = 15.0

# Audio buffer
Expand Down Expand Up @@ -447,9 +453,7 @@ def _prepare_config(
)

# Fixed end of Utterance
if bool(
config.end_of_utterance_mode == EndOfUtteranceMode.FIXED and not config.end_of_turn_config.use_forced_eou
):
if config.end_of_utterance_mode == EndOfUtteranceMode.FIXED:
transcription_config.conversation_config = ConversationConfig(
end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger,
)
Expand Down Expand Up @@ -659,8 +663,14 @@ async def send_audio(self, payload: bytes) -> None:
return

# Process with Silero VAD
if self._silero_detector:
asyncio.create_task(self._silero_detector.process_audio(payload))
if self._uses_silero_vad and self._silero_detector is not None:
asyncio.create_task(
self._silero_detector.process_audio(
payload,
sample_rate=self._audio_sample_rate,
sample_width=self._audio_sample_width,
)
)

# Add to audio buffer (use put_bytes to handle variable chunk sizes)
if self._config.audio_buffer_length > 0:
Expand Down Expand Up @@ -738,7 +748,7 @@ async def emit() -> None:
"""Wait for EndOfUtterance if needed, then emit segments."""

# Forced end of utterance message (only when no speaker is detected)
if self._config.end_of_turn_config.use_forced_eou:
if self._use_forced_eou:
await self._await_forced_eou()

# Check if the turn has changed
Expand All @@ -749,7 +759,7 @@ async def emit() -> None:
self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True, is_eou=True))

# Call async task (only if not already waiting for forced EOU)
if not (self._config.end_of_turn_config.use_forced_eou and self._forced_eou_active):
if not self._forced_eou_active:
asyncio.create_task(emit())

# ============================================================================
Expand Down Expand Up @@ -788,8 +798,8 @@ def _evt_on_final_transcript(message: dict[str, Any]) -> None:
return
self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True))

# End of Utterance (FIXED mode only)
if self._uses_fixed_eou:
# End of Utterance - only when not using ForceEndOfUtterance messages
if not self._use_forced_eou:

@self.on(ServerMessageType.END_OF_UTTERANCE) # type: ignore[misc]
def _evt_on_end_of_utterance(message: dict[str, Any]) -> None:
Expand Down Expand Up @@ -1066,7 +1076,7 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =

# Metadata
metadata = message.get("metadata", {})
payload_end_time = metadata.get("end_time", 0)
payload_end_time = self._calc_adjusted_time(metadata.get("end_time", 0))

# Iterate over the results in the payload
for result in message.get("results", []):
Expand All @@ -1075,8 +1085,8 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =
# Create the new fragment
fragment = SpeechFragment(
idx=self._next_fragment_id(),
start_time=result.get("start_time", 0),
end_time=result.get("end_time", 0),
start_time=self._calc_adjusted_time(result.get("start_time", 0)),
end_time=self._calc_adjusted_time(result.get("end_time", 0)),
language=alt.get("language", "en"),
direction=alt.get("direction", "ltr"),
type_=result.get("type", "word"),
Expand Down Expand Up @@ -1121,7 +1131,7 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =
self._last_fragment_end_time = max(self._last_fragment_end_time, fragment.end_time)

# Evaluate for VAD (only done on partials)
await self._vad_evaluation(fragments, is_final=is_final)
await self._speaker_start_stop_evaluation(fragments, is_final=is_final)

# Fragments to retain
retained_fragments = [
Expand Down Expand Up @@ -1205,18 +1215,8 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio
if change_filter and not changes.any(*change_filter):
return

# Skip re-evaluation if transcripts are older than smart turn cutoff
if self._smart_turn_pending_cutoff is not None and self._current_view:
latest_end_time = max(
(f.end_time for f in self._current_view.fragments if f.end_time is not None), default=0.0
)

# If all fragments end before or at the cutoff, skip re-evaluation
if latest_end_time <= self._smart_turn_pending_cutoff:
return

# Turn prediction
if self._uses_eot_prediction and self._uses_forced_eou and not self._forced_eou_active:
if self._emit_eot_predictions and not self._forced_eou_active and self._use_forced_eou:

async def fn() -> None:
ttl = await self._calculate_finalize_delay()
Expand Down Expand Up @@ -1518,14 +1518,20 @@ async def _calculate_finalize_delay(
annotation = annotation or AnnotationResult()

# VAD enabled
if self._silero_detector:
if self._uses_silero_vad:
annotation.add(AnnotationFlags.VAD_ACTIVE)
else:
annotation.add(AnnotationFlags.VAD_INACTIVE)

# Smart Turn enabled
if self._smart_turn_detector:
annotation.add(AnnotationFlags.SMART_TURN_ACTIVE)
# If Smart Turn hasn't returned a result yet but is enabled, add NO_SIGNAL annotation.
# This covers the case where the TTL fires before VAD triggers Smart Turn inference.
if not annotation.has(AnnotationFlags.SMART_TURN_TRUE) and not annotation.has(
AnnotationFlags.SMART_TURN_FALSE
):
annotation.add(AnnotationFlags.SMART_TURN_NO_SIGNAL)
else:
annotation.add(AnnotationFlags.SMART_TURN_INACTIVE)

Expand All @@ -1551,8 +1557,7 @@ async def _calculate_finalize_delay(
delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3)

# Trim off the most recent forced EOU delay if we're in forced EOU mode
if self._uses_forced_eou:
delay -= self._last_forced_eou_latency
delay -= self._last_forced_eou_latency

# Clamp to max delay and adjust for TTFB
clamped_delay = min(delay, self._config.end_of_utterance_max_delay)
Expand Down Expand Up @@ -1586,7 +1591,10 @@ async def _eot_prediction(
# Wait for Smart Turn result
if self._smart_turn_detector and end_time is not None:
result = await self._smart_turn_prediction(end_time, self._config.language, speaker=speaker)
if result.prediction:
if result.error:
# No valid prediction — SMART_TURN_NO_SIGNAL will be applied by _calculate_finalize_delay
pass
elif result.prediction:
annotation.add(AnnotationFlags.SMART_TURN_TRUE)
else:
annotation.add(AnnotationFlags.SMART_TURN_FALSE)
Expand Down Expand Up @@ -1676,17 +1684,18 @@ async def _await_forced_eou(self, timeout: float = 1.0) -> None:
# Add listener
self.once(AgentServerMessageType.END_OF_UTTERANCE, lambda message: eou_received.set())

# Trigger EOU message
self._emit_diagnostic_message("ForceEndOfUtterance sent - waiting for EndOfUtterance")

# Wait for EOU
try:
# Track the start time
start_time = time.time()
self._forced_eou_active = True

# Send the force EOU and wait for the response
await self.force_end_of_utterance()
timestamp = await self._calc_force_end_of_utterance()
await self.force_end_of_utterance(timestamp=timestamp)
self._emit_diagnostic_message(f"ForceEndOfUtterance sent - waiting for EndOfUtterance ({timestamp=})")

# Wait for the response
await asyncio.wait_for(eou_received.wait(), timeout=timeout)

# Record the latency
Expand All @@ -1698,11 +1707,43 @@ async def _await_forced_eou(self, timeout: float = 1.0) -> None:
finally:
self._forced_eou_active = False

async def _calc_force_end_of_utterance(self) -> float:
"""Force the end of the current utterance."""

# Seconds sent
timestamp: float = float(self._audio_bytes_sent) / (
self._audio_format.sample_rate * self._audio_format.bytes_per_sample
)

# Add padding for transcriber chunk size
padding: float = self._feou_chunk_s - (timestamp % self._feou_chunk_s)
self._feou_padding_s += padding

# Send silence
padding_silence = b"\x00" * int(padding * self._audio_format.sample_rate * self._audio_format.bytes_per_sample)
await self.send_audio(padding_silence)

# Return the time
return timestamp + padding

def _calc_adjusted_time(self, timestamp: float) -> float:
"""Calculate the adjusted timestamp.

As forced end of utterance is used, the time needs to get padded to fill the chunk size processed
by the engine. This needs to be kept track of and removed from timestamps that are returned within
the conversation.
"""

if not self._use_forced_eou:
return timestamp

return round(timestamp - self._feou_padding_s, 4)

# ============================================================================
# VAD (VOICE ACTIVITY DETECTION) / SPEAKER DETECTION
# ============================================================================

async def _vad_evaluation(self, fragments: list[SpeechFragment], is_final: bool) -> None:
async def _speaker_start_stop_evaluation(self, fragments: list[SpeechFragment], is_final: bool) -> None:
"""Emit a VAD event.

This will emit `SPEAKER_STARTED` and `SPEAKER_ENDED` events to the client and is
Expand Down Expand Up @@ -1850,18 +1891,20 @@ def _handle_silero_vad_result(self, result: SileroVADResult) -> None:
annotation.add(AnnotationFlags.VAD_STARTED)

# If speech has ended, we need to predict the end of turn
if result.speech_ended and self._uses_eot_prediction:
if self._emit_eot_predictions and result.speech_ended:
"""VAD-based end of turn prediction."""

# Set cutoff to prevent late transcripts from cancelling finalization
self._smart_turn_pending_cutoff = event_time

# Async callback
async def fn() -> None:
ttl = await self._eot_prediction(
end_time=event_time, speaker=self._current_speaker, annotation=annotation
)
self._turn_handler.update_timer(ttl)

# Call the eot calculation asynchronously
self._run_background_eot_calculation(fn, "silero_vad")

async def _handle_speaker_started(self, speaker: Optional[str], event_time: float) -> None:
Expand All @@ -1878,8 +1921,7 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa
await self._emit_start_of_turn(event_time)

# Update the turn handler
if self._uses_forced_eou:
self._turn_handler.reset()
self._turn_handler.reset()

# Emit the event
self._emit_message(
Expand All @@ -1902,7 +1944,7 @@ async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: floa
self._last_speak_end_latency = self._total_time - event_time

# Turn prediction
if self._uses_eot_prediction and not self._forced_eou_active:
if self._emit_eot_predictions and not self._forced_eou_active:

async def fn() -> None:
ttl = await self._eot_prediction(event_time, speaker)
Expand Down
Loading
Loading