Merge branch 'livekit:main' into main

CoderOMaster · web-flow · commit 309e80d99a92 · 2026-02-13T13:06:21.000+05:30
diff --git a/examples/drive-thru/database.py b/examples/drive-thru/database.py
@@ -44,6 +44,8 @@
     "E.g: a hamburger isn't a cheeseburger\n"
     "Do not ask for size unless the item has more than one size option specified. \n"
     "If an item does not require a size according to the menu, **NEVER** ask the customer to choose one or mention size at all. \n"
+    "\n\n"
+    "If there is any error from the tool, you should inform the customer and ask them to try again."
 )
 
 
diff --git a/livekit-agents/livekit/agents/utils/log.py b/livekit-agents/livekit/agents/utils/log.py
@@ -1,5 +1,5 @@
-import asyncio
 import functools
+import inspect
 import logging
 from collections.abc import Callable
 from typing import Any, TypeVar, cast
@@ -9,7 +9,7 @@
 
 def log_exceptions(msg: str = "", logger: logging.Logger = logging.getLogger()) -> Callable[[F], F]:  # noqa: B008
     def deco(fn: F) -> F:
-        if asyncio.iscoroutinefunction(fn):
+        if inspect.iscoroutinefunction(fn):
 
             @functools.wraps(fn)
             async def async_fn_logs(*args: Any, **kwargs: Any) -> Any:
diff --git a/livekit-agents/livekit/agents/voice/agent_session.py b/livekit-agents/livekit/agents/voice/agent_session.py
@@ -933,7 +933,7 @@ def say(
     def generate_reply(
         self,
         *,
-        user_input: NotGivenOr[str] = NOT_GIVEN,
+        user_input: NotGivenOr[str | llm.ChatMessage] = NOT_GIVEN,
         instructions: NotGivenOr[str] = NOT_GIVEN,
         tool_choice: NotGivenOr[llm.ToolChoice] = NOT_GIVEN,
         allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
@@ -943,7 +943,7 @@ def generate_reply(
         """Generate a reply for the agent to speak to the user.
 
         Args:
-            user_input (NotGivenOr[str], optional): The user's input that may influence the reply,
+            user_input (NotGivenOr[str | llm.ChatMessage], optional): The user's input that may influence the reply,
                 such as answering a question.
             instructions (NotGivenOr[str], optional): Additional instructions for generating the reply.
             tool_choice (NotGivenOr[llm.ToolChoice], optional): Specifies the external tool to use when
@@ -961,8 +961,8 @@ def generate_reply(
 
         user_message = (
             llm.ChatMessage(role="user", content=[user_input])
-            if is_given(user_input)
-            else NOT_GIVEN
+            if isinstance(user_input, str)
+            else user_input
         )
 
         run_state = self._global_run_state
diff --git a/livekit-agents/livekit/agents/voice/generation.py b/livekit-agents/livekit/agents/voice/generation.py
@@ -574,7 +574,7 @@ async def _run_mock(mock: Callable, *fnc_args: Any, **fnc_kwargs: Any) -> Any:
                         bound = sig.bind_partial(*trimmed_args, **trimmed_kwargs)
                         bound.apply_defaults()
 
-                        if asyncio.iscoroutinefunction(mock):
+                        if inspect.iscoroutinefunction(mock):
                             return await mock(*bound.args, **bound.kwargs)
                         else:
                             return mock(*bound.args, **bound.kwargs)
diff --git a/livekit-agents/livekit/agents/voice/room_io/types.py b/livekit-agents/livekit/agents/voice/room_io/types.py
@@ -50,8 +50,8 @@ class NoiseCancellationParams:
 ]
 
 
-def _default_text_input_cb(sess: AgentSession, ev: TextInputEvent) -> None:
-    sess.interrupt()
+async def _default_text_input_cb(sess: AgentSession, ev: TextInputEvent) -> None:
+    await sess.interrupt()
     sess.generate_reply(user_input=ev.text)
 
 
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py
@@ -79,6 +79,8 @@ class STTOptions:
     min_confidence_threshold: float
     profanity_filter: bool
     keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN
+    speech_start_timeout: NotGivenOr[float] = NOT_GIVEN
+    speech_end_timeout: NotGivenOr[float] = NOT_GIVEN
 
     @property
     def version(self) -> int:
@@ -135,6 +137,8 @@ def __init__(
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+        speech_start_timeout: NotGivenOr[float] = NOT_GIVEN,
+        speech_end_timeout: NotGivenOr[float] = NOT_GIVEN,
         use_streaming: NotGivenOr[bool] = NOT_GIVEN,
     ):
         """
@@ -162,6 +166,8 @@ def __init__(
             credentials_info(dict): the credentials info to use for recognition (default: None)
             credentials_file(str): the credentials file to use for recognition (default: None)
             keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
+            speech_start_timeout(float): seconds to wait for speech to begin before the stream times out; may be fractional (default: None)
+            speech_end_timeout(float): seconds of silence after speech ends before the stream times out; may be fractional (default: None)
             use_streaming(bool): whether to use streaming for recognition (default: True)
         """
         if not is_given(use_streaming):
@@ -217,6 +223,8 @@ def __init__(
             sample_rate=sample_rate,
             min_confidence_threshold=min_confidence_threshold,
             keywords=keywords,
+            speech_start_timeout=speech_start_timeout,
+            speech_end_timeout=speech_end_timeout,
         )
         self._streams = weakref.WeakSet[SpeechStream]()
         self._pool = utils.ConnectionPool[SpeechAsyncClientV2 | SpeechAsyncClientV1](
@@ -398,6 +406,8 @@ def update_options(
         model: NotGivenOr[SpeechModels] = NOT_GIVEN,
         location: NotGivenOr[str] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+        speech_start_timeout: NotGivenOr[float] = NOT_GIVEN,
+        speech_end_timeout: NotGivenOr[float] = NOT_GIVEN,
     ) -> None:
         if is_given(languages):
             if isinstance(languages, str):
@@ -425,6 +435,10 @@ def update_options(
             self._pool.invalidate()
         if is_given(keywords):
             self._config.keywords = keywords
+        if is_given(speech_start_timeout):
+            self._config.speech_start_timeout = speech_start_timeout
+        if is_given(speech_end_timeout):
+            self._config.speech_end_timeout = speech_end_timeout
 
         for stream in self._streams:
             stream.update_options(
@@ -436,6 +450,8 @@ def update_options(
                 profanity_filter=profanity_filter,
                 model=model,
                 keywords=keywords,
+                speech_start_timeout=speech_start_timeout,
+                speech_end_timeout=speech_end_timeout,
             )
 
     async def aclose(self) -> None:
@@ -473,6 +489,8 @@ def update_options(
         model: NotGivenOr[SpeechModels] = NOT_GIVEN,
         min_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+        speech_start_timeout: NotGivenOr[float] = NOT_GIVEN,
+        speech_end_timeout: NotGivenOr[float] = NOT_GIVEN,
     ) -> None:
         if is_given(languages):
             if isinstance(languages, str):
@@ -497,13 +515,36 @@ def update_options(
             self._config.min_confidence_threshold = min_confidence_threshold
         if is_given(keywords):
             self._config.keywords = keywords
+        if is_given(speech_start_timeout):
+            self._config.speech_start_timeout = speech_start_timeout
+        if is_given(speech_end_timeout):
+            self._config.speech_end_timeout = speech_end_timeout
 
         self._reconnect_event.set()
 
     def _build_streaming_config(
         self,
     ) -> cloud_speech_v2.StreamingRecognitionConfig | cloud_speech_v1.StreamingRecognitionConfig:
         if self._config.version == 2:
+            # Build voice activity timeout if either timeout is specified
+            voice_activity_timeout = None
+            if is_given(self._config.speech_start_timeout) or is_given(
+                self._config.speech_end_timeout
+            ):
+                voice_activity_timeout = (
+                    cloud_speech_v2.StreamingRecognitionFeatures.VoiceActivityTimeout()
+                )
+                if is_given(self._config.speech_start_timeout):
+                    voice_activity_timeout.speech_start_timeout = Duration(
+                        seconds=int(self._config.speech_start_timeout),
+                        nanos=int((self._config.speech_start_timeout % 1) * 1e9),
+                    )
+                if is_given(self._config.speech_end_timeout):
+                    voice_activity_timeout.speech_end_timeout = Duration(
+                        seconds=int(self._config.speech_end_timeout),
+                        nanos=int((self._config.speech_end_timeout % 1) * 1e9),
+                    )
+
             return cloud_speech_v2.StreamingRecognitionConfig(
                 config=cloud_speech_v2.RecognitionConfig(
                     explicit_decoding_config=cloud_speech_v2.ExplicitDecodingConfig(
@@ -524,7 +565,11 @@ def _build_streaming_config(
                 ),
                 streaming_features=cloud_speech_v2.StreamingRecognitionFeatures(
                     interim_results=self._config.interim_results,
-                    enable_voice_activity_events=self._config.enable_voice_activity_events,
+                    # The Speech-to-Text V2 API requires enable_voice_activity_events to be true
+                    # whenever voice_activity_timeout is set, so force it on in that case.
+                    enable_voice_activity_events=self._config.enable_voice_activity_events
+                    or (voice_activity_timeout is not None),
+                    voice_activity_timeout=voice_activity_timeout,
                 ),
             )
 
diff --git a/livekit-plugins/livekit-plugins-turn-detector/README.md b/livekit-plugins/livekit-plugins-turn-detector/README.md
@@ -18,7 +18,7 @@ pip install livekit-plugins-turn-detector
 
 ### Multilingual model
 
-We've trained a multilingual model that supports the following languages: `English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Indonesian, Russian, Turkish`
+We've trained a multilingual model that supports the following languages: `English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Indonesian, Russian, Turkish, Hindi`
 
 The multilingual model requires ~400MB of RAM and completes inferences in ~25ms.
 
diff --git a/tests/test_plugin_google_stt.py b/tests/test_plugin_google_stt.py
@@ -322,3 +322,101 @@ async def test_recognize_response_to_speech_event_words():
             ],
         )
     ]
+
+
+async def test_voice_activity_timeout_defaults():
+    """Test voice activity timeouts are not set by default."""
+    from livekit.agents.types import NOT_GIVEN
+    from livekit.plugins.google import STT
+
+    stt = STT()
+    assert stt._config.speech_start_timeout is NOT_GIVEN
+    assert stt._config.speech_end_timeout is NOT_GIVEN
+
+
+async def test_voice_activity_timeout_set():
+    """Test voice activity timeouts can be set."""
+    from livekit.plugins.google import STT
+
+    stt = STT(
+        speech_start_timeout=10.0,
+        speech_end_timeout=2.5,
+    )
+    assert stt._config.speech_start_timeout == 10.0
+    assert stt._config.speech_end_timeout == 2.5
+
+
+async def test_voice_activity_timeout_fractional_seconds():
+    """Test voice activity timeouts handle fractional seconds."""
+    from livekit.plugins.google import STT
+
+    stt = STT(
+        speech_start_timeout=5.5,
+        speech_end_timeout=1.25,
+    )
+    assert stt._config.speech_start_timeout == 5.5
+    assert stt._config.speech_end_timeout == 1.25
+
+
+async def test_voice_activity_timeout_speech_start_only():
+    """Test setting only speech_start_timeout."""
+    from livekit.agents.types import NOT_GIVEN
+    from livekit.plugins.google import STT
+
+    stt = STT(speech_start_timeout=15.0)
+    assert stt._config.speech_start_timeout == 15.0
+    assert stt._config.speech_end_timeout is NOT_GIVEN
+
+
+async def test_voice_activity_timeout_speech_end_only():
+    """Test setting only speech_end_timeout."""
+    from livekit.agents.types import NOT_GIVEN
+    from livekit.plugins.google import STT
+
+    stt = STT(speech_end_timeout=3.0)
+    assert stt._config.speech_end_timeout == 3.0
+    assert stt._config.speech_start_timeout is NOT_GIVEN
+
+
+async def test_voice_activity_timeout_v2_model():
+    """Test that V2 model detection works correctly."""
+    from livekit.plugins.google import STT
+
+    stt_v2 = STT(model="chirp_3")
+    assert stt_v2._config.version == 2
+
+    stt_v1 = STT(model="default")
+    assert stt_v1._config.version == 1
+
+
+async def test_voice_activity_timeout_update():
+    """Test that timeout options can be updated dynamically."""
+    from livekit.plugins.google import STT
+
+    stt = STT(
+        speech_start_timeout=10.0,
+        speech_end_timeout=2.0,
+    )
+    stt.update_options(
+        speech_start_timeout=15.0,
+        speech_end_timeout=3.0,
+    )
+    assert stt._config.speech_start_timeout == 15.0
+    assert stt._config.speech_end_timeout == 3.0
+
+
+async def test_voice_activity_timeout_partial_update():
+    """Test updating only one timeout at a time."""
+    from livekit.plugins.google import STT
+
+    stt = STT(
+        speech_start_timeout=10.0,
+        speech_end_timeout=2.0,
+    )
+    stt.update_options(speech_start_timeout=20.0)
+    assert stt._config.speech_start_timeout == 20.0
+    assert stt._config.speech_end_timeout == 2.0
+
+    stt.update_options(speech_end_timeout=5.0)
+    assert stt._config.speech_start_timeout == 20.0
+    assert stt._config.speech_end_timeout == 5.0

Original file line number	Diff line number	Diff line change
`@@ -44,6 +44,8 @@`
`44`	`44`	`"E.g: a hamburger isn't a cheeseburger\n"`
`45`	`45`	`"Do not ask for size unless the item has more than one size option specified. \n"`
`46`	`46`	`"If an item does not require a size according to the menu, NEVER ask the customer to choose one or mention size at all. \n"`
	`47`	`+ "\n\n"`
	`48`	`+ "If there is any error from the tool, you should inform the customer and ask them to try again."`
`47`	`49`	`)`
`48`	`50`
`49`	`51`
Original file line number	Diff line number	Diff line change
`@@ -50,8 +50,8 @@ class NoiseCancellationParams:`
`50`	`50`	`]`
`51`	`51`
`52`	`52`
`53`		`-def _default_text_input_cb(sess: AgentSession, ev: TextInputEvent) -> None:`
`54`		`- sess.interrupt()`
	`53`	`+async def _default_text_input_cb(sess: AgentSession, ev: TextInputEvent) -> None:`
	`54`	`+ await sess.interrupt()`
`55`	`55`	`sess.generate_reply(user_input=ev.text)`
`56`	`56`
`57`	`57`