@@ -79,6 +79,8 @@ class STTOptions:
7979 min_confidence_threshold : float
8080 profanity_filter : bool
8181 keywords : NotGivenOr [list [tuple [str , float ]]] = NOT_GIVEN
82+ speech_start_timeout : NotGivenOr [float ] = NOT_GIVEN
83+ speech_end_timeout : NotGivenOr [float ] = NOT_GIVEN
8284
8385 @property
8486 def version (self ) -> int :
@@ -135,6 +137,8 @@ def __init__(
135137 credentials_info : NotGivenOr [dict ] = NOT_GIVEN ,
136138 credentials_file : NotGivenOr [str ] = NOT_GIVEN ,
137139 keywords : NotGivenOr [list [tuple [str , float ]]] = NOT_GIVEN ,
140+ speech_start_timeout : NotGivenOr [float ] = NOT_GIVEN ,
141+ speech_end_timeout : NotGivenOr [float ] = NOT_GIVEN ,
138142 use_streaming : NotGivenOr [bool ] = NOT_GIVEN ,
139143 ):
140144 """
@@ -162,6 +166,8 @@ def __init__(
162166 credentials_info(dict): the credentials info to use for recognition (default: None)
163167 credentials_file(str): the credentials file to use for recognition (default: None)
164168 keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
169+ speech_start_timeout(float): maximum seconds to wait for speech to begin before timeout (default: None)
170+ speech_end_timeout(float): seconds of silence before marking utterance as complete (default: None)
165171 use_streaming(bool): whether to use streaming for recognition (default: True)
166172 """
167173 if not is_given (use_streaming ):
@@ -217,6 +223,8 @@ def __init__(
217223 sample_rate = sample_rate ,
218224 min_confidence_threshold = min_confidence_threshold ,
219225 keywords = keywords ,
226+ speech_start_timeout = speech_start_timeout ,
227+ speech_end_timeout = speech_end_timeout ,
220228 )
221229 self ._streams = weakref .WeakSet [SpeechStream ]()
222230 self ._pool = utils .ConnectionPool [SpeechAsyncClientV2 | SpeechAsyncClientV1 ](
@@ -398,6 +406,8 @@ def update_options(
398406 model : NotGivenOr [SpeechModels ] = NOT_GIVEN ,
399407 location : NotGivenOr [str ] = NOT_GIVEN ,
400408 keywords : NotGivenOr [list [tuple [str , float ]]] = NOT_GIVEN ,
409+ speech_start_timeout : NotGivenOr [float ] = NOT_GIVEN ,
410+ speech_end_timeout : NotGivenOr [float ] = NOT_GIVEN ,
401411 ) -> None :
402412 if is_given (languages ):
403413 if isinstance (languages , str ):
@@ -425,6 +435,10 @@ def update_options(
425435 self ._pool .invalidate ()
426436 if is_given (keywords ):
427437 self ._config .keywords = keywords
438+ if is_given (speech_start_timeout ):
439+ self ._config .speech_start_timeout = speech_start_timeout
440+ if is_given (speech_end_timeout ):
441+ self ._config .speech_end_timeout = speech_end_timeout
428442
429443 for stream in self ._streams :
430444 stream .update_options (
@@ -436,6 +450,8 @@ def update_options(
436450 profanity_filter = profanity_filter ,
437451 model = model ,
438452 keywords = keywords ,
453+ speech_start_timeout = speech_start_timeout ,
454+ speech_end_timeout = speech_end_timeout ,
439455 )
440456
441457 async def aclose (self ) -> None :
@@ -473,6 +489,8 @@ def update_options(
473489 model : NotGivenOr [SpeechModels ] = NOT_GIVEN ,
474490 min_confidence_threshold : NotGivenOr [float ] = NOT_GIVEN ,
475491 keywords : NotGivenOr [list [tuple [str , float ]]] = NOT_GIVEN ,
492+ speech_start_timeout : NotGivenOr [float ] = NOT_GIVEN ,
493+ speech_end_timeout : NotGivenOr [float ] = NOT_GIVEN ,
476494 ) -> None :
477495 if is_given (languages ):
478496 if isinstance (languages , str ):
@@ -497,13 +515,36 @@ def update_options(
497515 self ._config .min_confidence_threshold = min_confidence_threshold
498516 if is_given (keywords ):
499517 self ._config .keywords = keywords
518+ if is_given (speech_start_timeout ):
519+ self ._config .speech_start_timeout = speech_start_timeout
520+ if is_given (speech_end_timeout ):
521+ self ._config .speech_end_timeout = speech_end_timeout
500522
501523 self ._reconnect_event .set ()
502524
503525 def _build_streaming_config (
504526 self ,
505527 ) -> cloud_speech_v2 .StreamingRecognitionConfig | cloud_speech_v1 .StreamingRecognitionConfig :
506528 if self ._config .version == 2 :
529+ # Build voice activity timeout if either timeout is specified
530+ voice_activity_timeout = None
531+ if is_given (self ._config .speech_start_timeout ) or is_given (
532+ self ._config .speech_end_timeout
533+ ):
534+ voice_activity_timeout = (
535+ cloud_speech_v2 .StreamingRecognitionFeatures .VoiceActivityTimeout ()
536+ )
537+ if is_given (self ._config .speech_start_timeout ):
538+ voice_activity_timeout .speech_start_timeout = Duration (
539+ seconds = int (self ._config .speech_start_timeout ),
540+ nanos = int ((self ._config .speech_start_timeout % 1 ) * 1e9 ),
541+ )
542+ if is_given (self ._config .speech_end_timeout ):
543+ voice_activity_timeout .speech_end_timeout = Duration (
544+ seconds = int (self ._config .speech_end_timeout ),
545+ nanos = int ((self ._config .speech_end_timeout % 1 ) * 1e9 ),
546+ )
547+
507548 return cloud_speech_v2 .StreamingRecognitionConfig (
508549 config = cloud_speech_v2 .RecognitionConfig (
509550 explicit_decoding_config = cloud_speech_v2 .ExplicitDecodingConfig (
@@ -524,7 +565,11 @@ def _build_streaming_config(
524565 ),
525566 streaming_features = cloud_speech_v2 .StreamingRecognitionFeatures (
526567 interim_results = self ._config .interim_results ,
527- enable_voice_activity_events = self ._config .enable_voice_activity_events ,
568+ # Auto-enable voice activity events when voice_activity_timeout is specified,
569+ # as per Google API documentation requirements
570+ enable_voice_activity_events = self ._config .enable_voice_activity_events
571+ or (voice_activity_timeout is not None ),
572+ voice_activity_timeout = voice_activity_timeout ,
528573 ),
529574 )
530575
0 commit comments