Skip to content

Commit ea666dd

Browse files
authored
Add optional timestamp parameter to ForceEndOfUtterance message
## What's Changed? Add an extra **optional** attribute to the `ForceEndOfUtterance` message to help the engine know the real-world time marker for when the forced end of utterance should be considered. It should **only** be used with transcribers that support this extended message. ``` {"message": "ForceEndOfUtterance", "timestamp": 1.23} ``` Using the `client.audio_seconds_sent` property will return the number of seconds of audio that have been sent to the transcriber, calculated using the encoding format and sample rate of the session's audio (`AudioFormat`). This can then be used when calling the force end of utterance: ``` client.force_end_of_utterance(timestamp=client.audio_seconds_sent) ``` For transcribers that don't support the new timestamp parameter in this message (**version < 15.2.0**), the timestamp can be omitted by passing `timestamp=None` to the function, as follows: ``` client.force_end_of_utterance(timestamp=None) ```
2 parents 4252b44 + 46be795 commit ea666dd

3 files changed

Lines changed: 67 additions & 2 deletions

File tree

sdk/rt/speechmatics/rt/_async_client.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from ._exceptions import TimeoutError
1313
from ._exceptions import TranscriptionError
1414
from ._logging import get_logger
15+
from ._models import AudioEncoding
1516
from ._models import AudioEventsConfig
1617
from ._models import AudioFormat
1718
from ._models import ClientMessageType
@@ -20,6 +21,7 @@
2021
from ._models import TranscriptionConfig
2122
from ._models import TranslationConfig
2223

24+
_UNSET = object()
2325

2426
class AsyncClient(_BaseClient):
2527
"""
@@ -97,6 +99,8 @@ def __init__(
9799
self.on(ServerMessageType.WARNING, self._on_warning)
98100
self.on(ServerMessageType.AUDIO_ADDED, self._on_audio_added)
99101

102+
self._audio_format = AudioFormat(encoding=AudioEncoding.PCM_S16LE, sample_rate=44100, chunk_size=4096)
103+
100104
self._logger.debug("AsyncClient initialized (request_id=%s)", self._session.request_id)
101105

102106
async def start_session(
@@ -133,6 +137,9 @@ async def start_session(
133137
... await client.start_session()
134138
... await client.send_audio(frame)
135139
"""
140+
if audio_format is not None:
141+
self._audio_format = audio_format
142+
136143
await self._start_recognition_session(
137144
transcription_config=transcription_config,
138145
audio_format=audio_format,
@@ -161,16 +168,24 @@ async def stop_session(self) -> None:
161168
await self._session_done_evt.wait() # Wait for end of transcript event to indicate we can stop listening
162169
await self.close()
163170

164-
async def force_end_of_utterance(self) -> None:
171+
async def force_end_of_utterance(self, *, timestamp: Optional[float] | object = _UNSET) -> None:
165172
"""
166173
This method sends a ForceEndOfUtterance message to the server to signal
167174
the end of an utterance. Forcing end of utterance will cause the final
168175
transcript to be sent to the client early.
169176
177+
Takes an optional timestamp parameter to specify a marker for the engine
178+
to use for timing of the end of the utterance. If not provided, the timestamp
179+
will be calculated based on the cumulative audio sent to the server. If the provided
180+
timestamp is None, the ForceEndOfUtterance message will not include a timestamp.
181+
Args:
182+
timestamp: Optional timestamp for the request.
183+
170184
Raises:
171185
ConnectionError: If the WebSocket connection fails.
172186
TranscriptionError: If the server reports an error during teardown.
173187
TimeoutError: If the connection or teardown times out.
188+
ValueError: If the audio format does not have an encoding set.
174189
175190
Examples:
176191
Basic streaming:
@@ -179,7 +194,27 @@ async def force_end_of_utterance(self) -> None:
179194
... await client.send_audio(frame)
180195
... await client.force_end_of_utterance()
181196
"""
182-
await self.send_message({"message": ClientMessageType.FORCE_END_OF_UTTERANCE})
197+
198+
message: dict[str,Any] = {"message": ClientMessageType.FORCE_END_OF_UTTERANCE}
199+
200+
if timestamp is _UNSET:
201+
# default: auto-set from audio_seconds_sent
202+
message["timestamp"] = self.audio_seconds_sent
203+
elif timestamp is not None:
204+
# user provided explicit value
205+
message["timestamp"] = timestamp
206+
# if timestamp is None: omit entirely
207+
208+
await self.send_message(message)
209+
210+
@property
211+
def audio_seconds_sent(self) -> float:
212+
"""Number of audio seconds sent to the server.
213+
214+
Raises:
215+
ValueError: If the audio format does not have an encoding set.
216+
"""
217+
return self._audio_bytes_sent / (self._audio_format.sample_rate * self._audio_format.bytes_per_sample)
183218

184219
async def transcribe(
185220
self,

sdk/rt/speechmatics/rt/_base_client.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def __init__(self, transport: Transport) -> None:
4242
self._recv_task: Optional[asyncio.Task[None]] = None
4343
self._closed_evt = asyncio.Event()
4444
self._eos_sent = False
45+
self._audio_bytes_sent = 0
4546
self._seq_no = 0
4647

4748
self._logger = get_logger("speechmatics.rt.base_client")
@@ -122,11 +123,17 @@ async def send_audio(self, payload: bytes) -> None:
122123

123124
try:
124125
await self._transport.send_message(payload)
126+
self._audio_bytes_sent += len(payload)
125127
self._seq_no += 1
126128
except Exception:
127129
self._closed_evt.set()
128130
raise
129131

132+
@property
133+
def audio_bytes_sent(self) -> int:
134+
"""Number of audio bytes sent to the server."""
135+
return self._audio_bytes_sent
136+
130137
async def send_message(self, message: dict[str, Any]) -> None:
131138
"""
132139
Send a message through the WebSocket.

sdk/rt/speechmatics/rt/_models.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,29 @@ class AudioFormat:
183183
sample_rate: int = 44100
184184
chunk_size: int = 4096
185185

186+
_BYTES_PER_SAMPLE = {
187+
AudioEncoding.PCM_F32LE: 4,
188+
AudioEncoding.PCM_S16LE: 2,
189+
AudioEncoding.MULAW: 1,
190+
}
191+
192+
@property
193+
def bytes_per_sample(self) -> int:
194+
"""Number of bytes per audio sample based on encoding.
195+
196+
Raises:
197+
ValueError: If encoding is None (file type) or unrecognized.
198+
"""
199+
if self.encoding is None:
200+
raise ValueError(
201+
"Cannot determine bytes per sample for file-type audio format. "
202+
"Set an explicit encoding on AudioFormat."
203+
)
204+
try:
205+
return self._BYTES_PER_SAMPLE[self.encoding]
206+
except KeyError:
207+
raise ValueError(f"Unknown encoding: {self.encoding}")
208+
186209
def to_dict(self) -> dict[str, Any]:
187210
"""
188211
Convert audio format to dictionary.

0 commit comments

Comments
 (0)