Skip to content

Commit ea666dd

Browse files
authored
Add optional timestamp parameter to ForceEndOfUtterance message
## What's Changed? Add an extra **optional** attribute to the `ForceEndOfUtterance` message to help the engine know the real-world time marker for when the forced end of utterance should be considered. It should **only** be used with transcribers that support this extended message. ``` {"message": "ForceEndOfUtterance", "timestamp": 1.23} ``` Using the `client.audio_seconds_sent` property will return the number of seconds of audio that have been sent to the transcriber, calculated using the encoding format and sample rate of the session's audio (`AudioFormat`). This can then be used when calling the force end of utterance: ``` client.force_end_of_utterance(timestamp=client.audio_seconds_sent) ``` For transcribers that don't support the new timestamp parameter in this message (**version < 15.2.0**), the timestamp can be omitted by passing `timestamp=None` to the function, as follows: ``` client.force_end_of_utterance(timestamp=None) ```
2 parents 4252b44 + 46be795 commit ea666dd

3 files changed

Lines changed: 67 additions & 2 deletions

File tree

sdk/rt/speechmatics/rt/_async_client.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from ._exceptions import TimeoutError
1313
from ._exceptions import TranscriptionError
1414
from ._logging import get_logger
15+
from ._models import AudioEncoding
1516
from ._models import AudioEventsConfig
1617
from ._models import AudioFormat
1718
from ._models import ClientMessageType
@@ -20,6 +21,7 @@
2021
from ._models import TranscriptionConfig
2122
from ._models import TranslationConfig
2223

24+
_UNSET = object()
2325

2426
class AsyncClient(_BaseClient):
2527
"""
@@ -97,6 +99,8 @@ def __init__(
9799
self.on(ServerMessageType.WARNING, self._on_warning)
98100
self.on(ServerMessageType.AUDIO_ADDED, self._on_audio_added)
99101

102+
self._audio_format = AudioFormat(encoding=AudioEncoding.PCM_S16LE, sample_rate=44100, chunk_size=4096)
103+
100104
self._logger.debug("AsyncClient initialized (request_id=%s)", self._session.request_id)
101105

102106
async def start_session(
@@ -133,6 +137,9 @@ async def start_session(
133137
... await client.start_session()
134138
... await client.send_audio(frame)
135139
"""
140+
if audio_format is not None:
141+
self._audio_format = audio_format
142+
136143
await self._start_recognition_session(
137144
transcription_config=transcription_config,
138145
audio_format=audio_format,
@@ -161,16 +168,24 @@ async def stop_session(self) -> None:
161168
await self._session_done_evt.wait() # Wait for end of transcript event to indicate we can stop listening
162169
await self.close()
163170

164-
async def force_end_of_utterance(self) -> None:
171+
async def force_end_of_utterance(self, *, timestamp: Optional[float] | object = _UNSET) -> None:
165172
"""
166173
This method sends a ForceEndOfUtterance message to the server to signal
167174
the end of an utterance. Forcing end of utterance will cause the final
168175
transcript to be sent to the client early.
169176
177+
Takes an optional timestamp parameter to specify a marker for the engine
178+
to use for timing of the end of the utterance. If not provided, the timestamp
179+
will be calculated based on the cumulative audio sent to the server. If the provided
180+
timestamp is None, the ForceEndOfUtterance message will not include a timestamp.
181+
Args:
182+
timestamp: Optional timestamp for the request.
183+
170184
Raises:
171185
ConnectionError: If the WebSocket connection fails.
172186
TranscriptionError: If the server reports an error during teardown.
173187
TimeoutError: If the connection or teardown times out.
188+
ValueError: If the audio format does not have an encoding set.
174189
175190
Examples:
176191
Basic streaming:
@@ -179,7 +194,27 @@ async def force_end_of_utterance(self) -> None:
179194
... await client.send_audio(frame)
180195
... await client.force_end_of_utterance()
181196
"""
182-
await self.send_message({"message": ClientMessageType.FORCE_END_OF_UTTERANCE})
197+
198+
message: dict[str,Any] = {"message": ClientMessageType.FORCE_END_OF_UTTERANCE}
199+
200+
if timestamp is _UNSET:
201+
# default: auto-set from audio_seconds_sent
202+
message["timestamp"] = self.audio_seconds_sent
203+
elif timestamp is not None:
204+
# user provided explicit value
205+
message["timestamp"] = timestamp
206+
# if timestamp is None: omit entirely
207+
208+
await self.send_message(message)
209+
210+
@property
211+
def audio_seconds_sent(self) -> float:
212+
"""Number of audio seconds sent to the server.
213+
214+
Raises:
215+
ValueError: If the audio format does not have an encoding set.
216+
"""
217+
return self._audio_bytes_sent / (self._audio_format.sample_rate * self._audio_format.bytes_per_sample)
183218

184219
async def transcribe(
185220
self,

sdk/rt/speechmatics/rt/_base_client.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def __init__(self, transport: Transport) -> None:
4242
self._recv_task: Optional[asyncio.Task[None]] = None
4343
self._closed_evt = asyncio.Event()
4444
self._eos_sent = False
45+
self._audio_bytes_sent = 0
4546
self._seq_no = 0
4647

4748
self._logger = get_logger("speechmatics.rt.base_client")
@@ -122,11 +123,17 @@ async def send_audio(self, payload: bytes) -> None:
122123

123124
try:
124125
await self._transport.send_message(payload)
126+
self._audio_bytes_sent += len(payload)
125127
self._seq_no += 1
126128
except Exception:
127129
self._closed_evt.set()
128130
raise
129131

132+
@property
133+
def audio_bytes_sent(self) -> int:
134+
"""Number of audio bytes sent to the server."""
135+
return self._audio_bytes_sent
136+
130137
async def send_message(self, message: dict[str, Any]) -> None:
131138
"""
132139
Send a message through the WebSocket.

sdk/rt/speechmatics/rt/_models.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,29 @@ class AudioFormat:
183183
sample_rate: int = 44100
184184
chunk_size: int = 4096
185185

186+
_BYTES_PER_SAMPLE = {
187+
AudioEncoding.PCM_F32LE: 4,
188+
AudioEncoding.PCM_S16LE: 2,
189+
AudioEncoding.MULAW: 1,
190+
}
191+
192+
@property
193+
def bytes_per_sample(self) -> int:
194+
"""Number of bytes per audio sample based on encoding.
195+
196+
Raises:
197+
ValueError: If encoding is None (file type) or unrecognized.
198+
"""
199+
if self.encoding is None:
200+
raise ValueError(
201+
"Cannot determine bytes per sample for file-type audio format. "
202+
"Set an explicit encoding on AudioFormat."
203+
)
204+
try:
205+
return self._BYTES_PER_SAMPLE[self.encoding]
206+
except KeyError:
207+
raise ValueError(f"Unknown encoding: {self.encoding}")
208+
186209
def to_dict(self) -> dict[str, Any]:
187210
"""
188211
Convert audio format to dictionary.

0 commit comments

Comments
 (0)