Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
33e693f
refactor: enhance end of turn penalty logic
sam-s10s Feb 23, 2026
301bcf4
refactor: enhance end of turn penalty logic
sam-s10s Feb 23, 2026
d9de589
Add Penalty when Smart Turn hasn't been run (#86)
LArmstrongDev Feb 25, 2026
3375c3d
Merge branch 'fix/smart-turn' of https://github.com/speechmatics/spee…
sam-s10s Feb 25, 2026
7a52b3f
test: add `test_no_feou_fix` for FEOU disabled
sam-s10s Mar 2, 2026
1443b33
feat: integrate config validation and improve presets
sam-s10s Mar 2, 2026
386f37b
fix: enforce use of forced end of utterance
sam-s10s Mar 3, 2026
155fceb
refactor: simplify EOU and VAD logic, improve readability
sam-s10s Mar 3, 2026
0b28473
refactor: remove forced end-of-utterance config from tests
sam-s10s Mar 3, 2026
31aa3ac
remove: Delete outdated conditional validation for 'use_forced_eou' i…
sam-s10s Mar 3, 2026
ca0f22f
fix: handle forced EOU more securely in turn management
sam-s10s Mar 3, 2026
95dda05
manually set FEOU to be disabled for the tests.
sam-s10s Mar 3, 2026
5ecc473
remove `ws_headers` as part of a different PR
sam-s10s Mar 3, 2026
e30cc5e
fix: correct logic for end of utterance handling
sam-s10s Mar 3, 2026
cd7de39
refactor: extract config setup and ensure client disconnect
sam-s10s Mar 3, 2026
7f03cc5
chore: pin speechmatics-rt dependency version for voice
sam-s10s Mar 5, 2026
81815ee
fix: support 8kHz audio in VAD and smart turn
sam-s10s Mar 10, 2026
0e56620
fix: only predict end of turn when speech ended
sam-s10s Mar 11, 2026
4182979
test: re-enable speaker focus test cases
sam-s10s Mar 11, 2026
5583174
test: use env var for RT URL and fix assertions
sam-s10s Mar 11, 2026
18b56f9
fix: remove unused turn extend delay and dead code
sam-s10s Mar 11, 2026
8720ed4
Merge branch 'main' into fix/smart-turn
sam-s10s Mar 24, 2026
103cac4
Relax speechmatics-rt version pin to minimum
sam-s10s Mar 24, 2026
99b8aaf
Bump speechmatics-rt to v1.0.0 and adjust timestamps for forced end o…
sam-s10s Mar 31, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sdk/rt/speechmatics/rt/_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

# Sentinel marking "argument not supplied": lets `force_end_of_utterance`
# distinguish a caller omitting `timestamp` (auto-set from audio sent)
# from a caller passing an explicit value.
_UNSET = object()


class AsyncClient(_BaseClient):
"""
Asynchronous client for Speechmatics real-time audio transcription.
Expand Down Expand Up @@ -195,7 +196,7 @@ async def force_end_of_utterance(self, *, timestamp: Optional[float] | object =
... await client.force_end_of_utterance()
"""

message: dict[str,Any] = {"message": ClientMessageType.FORCE_END_OF_UTTERANCE}
message: dict[str, Any] = {"message": ClientMessageType.FORCE_END_OF_UTTERANCE}

if timestamp is _UNSET:
# default: auto-set from audio_seconds_sent
Expand Down
2 changes: 1 addition & 1 deletion sdk/voice/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ authors = [{ name = "Speechmatics", email = "support@speechmatics.com" }]
license = "MIT"
requires-python = ">=3.9"
dependencies = [
"speechmatics-rt>=0.5.3",
"speechmatics-rt>=1.0.0",
"pydantic>=2.10.6,<3",
"numpy>=1.26.4,<3"
]
Expand Down
142 changes: 92 additions & 50 deletions sdk/voice/speechmatics/voice/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ def __init__(
preset_config = VoiceAgentConfigPreset.load(preset)
config = VoiceAgentConfigPreset._merge_configs(preset_config, config)

# Validate the final config (deferred to allow overlay/preset merging first)
if config is not None:
config.validate_config()

# Process the config
self._config, self._transcription_config, self._audio_format = self._prepare_config(config)

Expand Down Expand Up @@ -310,24 +314,26 @@ def __init__(
self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize)
self._eot_calculation_task: Optional[asyncio.Task] = None

# Uses fixed EndOfUtterance message from STT
self._uses_fixed_eou: bool = (
self._eou_mode == EndOfUtteranceMode.FIXED
and not self._silero_detector
and not self._config.end_of_turn_config.use_forced_eou
)

# Uses ForceEndOfUtterance message
self._uses_forced_eou: bool = not self._uses_fixed_eou
# Forced end of utterance handling
# FEOU is not used in FIXED mode, unless VAD has been enabled. It can / should
# also be disabled during testing when not connected to an endpoint, as the
# waiting for FEOU response will block the test.
self._use_forced_eou: bool = self._eou_mode is not EndOfUtteranceMode.FIXED or self._uses_silero_vad
self._forced_eou_active: bool = False
self._last_forced_eou_latency: float = 0.0

# Emit EOT prediction (uses _uses_forced_eou)
self._uses_eot_prediction: bool = self._eou_mode not in [
# Emit EOT prediction
# EOT predictions are only relevant when not using the FIXED or EXTERNAL modes,
# as these use different triggers to finalize the turn.
self._emit_eot_predictions: bool = self._eou_mode not in [
EndOfUtteranceMode.FIXED,
EndOfUtteranceMode.EXTERNAL,
]

# Time slip for Forced End Of Utterance
self._feou_chunk_s: float = 0.360
self._feou_padding_s: float = 0.0

# -------------------------------------
# Diarization / Speakers
# -------------------------------------
Expand Down Expand Up @@ -360,8 +366,8 @@ def __init__(
AudioEncoding.PCM_S16LE: 2,
}.get(self._audio_format.encoding, 1)

# Default audio buffer
if not self._config.audio_buffer_length and (self._uses_smart_turn or self._uses_silero_vad):
# Default audio buffer (used when Silero VAD is enabled and with Smart Turn)
if not self._config.audio_buffer_length and self._uses_silero_vad:
self._config.audio_buffer_length = 15.0

# Audio buffer
Expand Down Expand Up @@ -447,9 +453,7 @@ def _prepare_config(
)

# Fixed end of Utterance
if bool(
config.end_of_utterance_mode == EndOfUtteranceMode.FIXED and not config.end_of_turn_config.use_forced_eou
):
if config.end_of_utterance_mode == EndOfUtteranceMode.FIXED:
transcription_config.conversation_config = ConversationConfig(
end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger,
)
Expand Down Expand Up @@ -659,8 +663,14 @@ async def send_audio(self, payload: bytes) -> None:
return

# Process with Silero VAD
if self._silero_detector:
asyncio.create_task(self._silero_detector.process_audio(payload))
if self._uses_silero_vad and self._silero_detector is not None:
asyncio.create_task(
self._silero_detector.process_audio(
payload,
sample_rate=self._audio_sample_rate,
sample_width=self._audio_sample_width,
)
)

# Add to audio buffer (use put_bytes to handle variable chunk sizes)
if self._config.audio_buffer_length > 0:
Expand Down Expand Up @@ -738,7 +748,7 @@ async def emit() -> None:
"""Wait for EndOfUtterance if needed, then emit segments."""

# Forced end of utterance message (only when no speaker is detected)
if self._config.end_of_turn_config.use_forced_eou:
if self._use_forced_eou:
await self._await_forced_eou()

# Check if the turn has changed
Expand All @@ -749,7 +759,7 @@ async def emit() -> None:
self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True, is_eou=True))

# Call async task (only if not already waiting for forced EOU)
if not (self._config.end_of_turn_config.use_forced_eou and self._forced_eou_active):
if not self._forced_eou_active:
asyncio.create_task(emit())

# ============================================================================
Expand Down Expand Up @@ -788,8 +798,8 @@ def _evt_on_final_transcript(message: dict[str, Any]) -> None:
return
self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True))

# End of Utterance (FIXED mode only)
if self._uses_fixed_eou:
# End of Utterance - only when not using ForceEndOfUtterance messages
if not self._use_forced_eou:

@self.on(ServerMessageType.END_OF_UTTERANCE) # type: ignore[misc]
def _evt_on_end_of_utterance(message: dict[str, Any]) -> None:
Expand Down Expand Up @@ -1066,7 +1076,7 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =

# Metadata
metadata = message.get("metadata", {})
payload_end_time = metadata.get("end_time", 0)
payload_end_time = self._calc_adjusted_time(metadata.get("end_time", 0))

# Iterate over the results in the payload
for result in message.get("results", []):
Expand All @@ -1075,8 +1085,8 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =
# Create the new fragment
fragment = SpeechFragment(
idx=self._next_fragment_id(),
start_time=result.get("start_time", 0),
end_time=result.get("end_time", 0),
start_time=self._calc_adjusted_time(result.get("start_time", 0)),
end_time=self._calc_adjusted_time(result.get("end_time", 0)),
language=alt.get("language", "en"),
direction=alt.get("direction", "ltr"),
type_=result.get("type", "word"),
Expand Down Expand Up @@ -1121,7 +1131,7 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =
self._last_fragment_end_time = max(self._last_fragment_end_time, fragment.end_time)

# Evaluate for VAD (only done on partials)
await self._vad_evaluation(fragments, is_final=is_final)
await self._speaker_start_stop_evaluation(fragments, is_final=is_final)

# Fragments to retain
retained_fragments = [
Expand Down Expand Up @@ -1205,18 +1215,8 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio
if change_filter and not changes.any(*change_filter):
return

# Skip re-evaluation if transcripts are older than smart turn cutoff
if self._smart_turn_pending_cutoff is not None and self._current_view:
latest_end_time = max(
(f.end_time for f in self._current_view.fragments if f.end_time is not None), default=0.0
)

# If all fragments end before or at the cutoff, skip re-evaluation
if latest_end_time <= self._smart_turn_pending_cutoff:
return

# Turn prediction
if self._uses_eot_prediction and self._uses_forced_eou and not self._forced_eou_active:
if self._emit_eot_predictions and not self._forced_eou_active and self._use_forced_eou:

async def fn() -> None:
ttl = await self._calculate_finalize_delay()
Expand Down Expand Up @@ -1518,14 +1518,20 @@ async def _calculate_finalize_delay(
annotation = annotation or AnnotationResult()

# VAD enabled
if self._silero_detector:
if self._uses_silero_vad:
annotation.add(AnnotationFlags.VAD_ACTIVE)
else:
annotation.add(AnnotationFlags.VAD_INACTIVE)

# Smart Turn enabled
if self._smart_turn_detector:
annotation.add(AnnotationFlags.SMART_TURN_ACTIVE)
# If Smart Turn hasn't returned a result yet but is enabled, add NO_SIGNAL annotation.
# This covers the case where the TTL fires before VAD triggers Smart Turn inference.
if not annotation.has(AnnotationFlags.SMART_TURN_TRUE) and not annotation.has(
AnnotationFlags.SMART_TURN_FALSE
):
annotation.add(AnnotationFlags.SMART_TURN_NO_SIGNAL)
else:
annotation.add(AnnotationFlags.SMART_TURN_INACTIVE)

Expand All @@ -1551,8 +1557,7 @@ async def _calculate_finalize_delay(
delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3)

# Trim off the most recent forced EOU delay if we're in forced EOU mode
if self._uses_forced_eou:
delay -= self._last_forced_eou_latency
delay -= self._last_forced_eou_latency

# Clamp to max delay and adjust for TTFB
clamped_delay = min(delay, self._config.end_of_utterance_max_delay)
Expand Down Expand Up @@ -1586,7 +1591,10 @@ async def _eot_prediction(
# Wait for Smart Turn result
if self._smart_turn_detector and end_time is not None:
result = await self._smart_turn_prediction(end_time, self._config.language, speaker=speaker)
if result.prediction:
if result.error:
# No valid prediction — SMART_TURN_NO_SIGNAL will be applied by _calculate_finalize_delay
pass
elif result.prediction:
annotation.add(AnnotationFlags.SMART_TURN_TRUE)
else:
annotation.add(AnnotationFlags.SMART_TURN_FALSE)
Expand Down Expand Up @@ -1676,17 +1684,18 @@ async def _await_forced_eou(self, timeout: float = 1.0) -> None:
# Add listener
self.once(AgentServerMessageType.END_OF_UTTERANCE, lambda message: eou_received.set())

# Trigger EOU message
self._emit_diagnostic_message("ForceEndOfUtterance sent - waiting for EndOfUtterance")

# Wait for EOU
try:
# Track the start time
start_time = time.time()
self._forced_eou_active = True

# Send the force EOU and wait for the response
await self.force_end_of_utterance()
timestamp = await self._calc_force_end_of_utterance()
await self.force_end_of_utterance(timestamp=timestamp)
self._emit_diagnostic_message(f"ForceEndOfUtterance sent - waiting for EndOfUtterance ({timestamp=})")

# Wait for the response
await asyncio.wait_for(eou_received.wait(), timeout=timeout)

# Record the latency
Expand All @@ -1698,11 +1707,43 @@ async def _await_forced_eou(self, timeout: float = 1.0) -> None:
finally:
self._forced_eou_active = False

async def _calc_force_end_of_utterance(self) -> float:
"""Force the end of the current utterance."""

# Seconds sent
timestamp: float = float(self._audio_bytes_sent) / (
self._audio_format.sample_rate * self._audio_format.bytes_per_sample
)

# Add padding for transcriber chunk size
padding: float = self._feou_chunk_s - (timestamp % self._feou_chunk_s)
self._feou_padding_s += padding

# Send silence
padding_silence = b"\x00" * int(padding * self._audio_format.sample_rate * self._audio_format.bytes_per_sample)
await self.send_audio(padding_silence)

# Return the time
return timestamp + padding

def _calc_adjusted_time(self, timestamp: float) -> float:
"""Calculate the adjusted timestamp.

As forced end of utterance is used, the time needs to get padded to fill the chunk size processed
by the engine. This needs to be kept track of and removed from timestamps that are returned within
the conversation.
"""

if not self._use_forced_eou:
return timestamp

return round(timestamp - self._feou_padding_s, 4)

# ============================================================================
# VAD (VOICE ACTIVITY DETECTION) / SPEAKER DETECTION
# ============================================================================

async def _vad_evaluation(self, fragments: list[SpeechFragment], is_final: bool) -> None:
async def _speaker_start_stop_evaluation(self, fragments: list[SpeechFragment], is_final: bool) -> None:
"""Emit a VAD event.

This will emit `SPEAKER_STARTED` and `SPEAKER_ENDED` events to the client and is
Expand Down Expand Up @@ -1850,18 +1891,20 @@ def _handle_silero_vad_result(self, result: SileroVADResult) -> None:
annotation.add(AnnotationFlags.VAD_STARTED)

# If speech has ended, we need to predict the end of turn
if result.speech_ended and self._uses_eot_prediction:
if self._emit_eot_predictions and result.speech_ended:
"""VAD-based end of turn prediction."""

# Set cutoff to prevent late transcripts from cancelling finalization
self._smart_turn_pending_cutoff = event_time

# Async callback
async def fn() -> None:
ttl = await self._eot_prediction(
end_time=event_time, speaker=self._current_speaker, annotation=annotation
)
self._turn_handler.update_timer(ttl)

# Call the eot calculation asynchronously
self._run_background_eot_calculation(fn, "silero_vad")

async def _handle_speaker_started(self, speaker: Optional[str], event_time: float) -> None:
Expand All @@ -1878,8 +1921,7 @@ async def _handle_speaker_started(self, speaker: Optional[str], event_time: floa
await self._emit_start_of_turn(event_time)

# Update the turn handler
if self._uses_forced_eou:
self._turn_handler.reset()
self._turn_handler.reset()

# Emit the event
self._emit_message(
Expand All @@ -1902,7 +1944,7 @@ async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: floa
self._last_speak_end_latency = self._total_time - event_time

# Turn prediction
if self._uses_eot_prediction and not self._forced_eou_active:
if self._emit_eot_predictions and not self._forced_eou_active:

async def fn() -> None:
ttl = await self._eot_prediction(event_time, speaker)
Expand Down
Loading
Loading