Skip to content

Commit 309e80d

Browse files
authored
Merge branch 'livekit:main' into main
2 parents 466ac4d + b87c5d2 commit 309e80d

8 files changed

Lines changed: 156 additions & 11 deletions

File tree

examples/drive-thru/database.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
"E.g: a hamburger isn't a cheeseburger\n"
4545
"Do not ask for size unless the item has more than one size option specified. \n"
4646
"If an item does not require a size according to the menu, **NEVER** ask the customer to choose one or mention size at all. \n"
47+
"\n\n"
48+
"If there is any error from the tool, you should inform the customer and ask them to try again."
4749
)
4850

4951

livekit-agents/livekit/agents/utils/log.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import asyncio
21
import functools
2+
import inspect
33
import logging
44
from collections.abc import Callable
55
from typing import Any, TypeVar, cast
@@ -9,7 +9,7 @@
99

1010
def log_exceptions(msg: str = "", logger: logging.Logger = logging.getLogger()) -> Callable[[F], F]: # noqa: B008
1111
def deco(fn: F) -> F:
12-
if asyncio.iscoroutinefunction(fn):
12+
if inspect.iscoroutinefunction(fn):
1313

1414
@functools.wraps(fn)
1515
async def async_fn_logs(*args: Any, **kwargs: Any) -> Any:

livekit-agents/livekit/agents/voice/agent_session.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -933,7 +933,7 @@ def say(
933933
def generate_reply(
934934
self,
935935
*,
936-
user_input: NotGivenOr[str] = NOT_GIVEN,
936+
user_input: NotGivenOr[str | llm.ChatMessage] = NOT_GIVEN,
937937
instructions: NotGivenOr[str] = NOT_GIVEN,
938938
tool_choice: NotGivenOr[llm.ToolChoice] = NOT_GIVEN,
939939
allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
@@ -943,7 +943,7 @@ def generate_reply(
943943
"""Generate a reply for the agent to speak to the user.
944944
945945
Args:
946-
user_input (NotGivenOr[str], optional): The user's input that may influence the reply,
946+
user_input (NotGivenOr[str | llm.ChatMessage], optional): The user's input that may influence the reply,
947947
such as answering a question.
948948
instructions (NotGivenOr[str], optional): Additional instructions for generating the reply.
949949
tool_choice (NotGivenOr[llm.ToolChoice], optional): Specifies the external tool to use when
@@ -961,8 +961,8 @@ def generate_reply(
961961

962962
user_message = (
963963
llm.ChatMessage(role="user", content=[user_input])
964-
if is_given(user_input)
965-
else NOT_GIVEN
964+
if isinstance(user_input, str)
965+
else user_input
966966
)
967967

968968
run_state = self._global_run_state

livekit-agents/livekit/agents/voice/generation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,7 @@ async def _run_mock(mock: Callable, *fnc_args: Any, **fnc_kwargs: Any) -> Any:
574574
bound = sig.bind_partial(*trimmed_args, **trimmed_kwargs)
575575
bound.apply_defaults()
576576

577-
if asyncio.iscoroutinefunction(mock):
577+
if inspect.iscoroutinefunction(mock):
578578
return await mock(*bound.args, **bound.kwargs)
579579
else:
580580
return mock(*bound.args, **bound.kwargs)

livekit-agents/livekit/agents/voice/room_io/types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ class NoiseCancellationParams:
5050
]
5151

5252

53-
def _default_text_input_cb(sess: AgentSession, ev: TextInputEvent) -> None:
54-
sess.interrupt()
53+
async def _default_text_input_cb(sess: AgentSession, ev: TextInputEvent) -> None:
54+
await sess.interrupt()
5555
sess.generate_reply(user_input=ev.text)
5656

5757

livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ class STTOptions:
7979
min_confidence_threshold: float
8080
profanity_filter: bool
8181
keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN
82+
speech_start_timeout: NotGivenOr[float] = NOT_GIVEN
83+
speech_end_timeout: NotGivenOr[float] = NOT_GIVEN
8284

8385
@property
8486
def version(self) -> int:
@@ -135,6 +137,8 @@ def __init__(
135137
credentials_info: NotGivenOr[dict] = NOT_GIVEN,
136138
credentials_file: NotGivenOr[str] = NOT_GIVEN,
137139
keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
140+
speech_start_timeout: NotGivenOr[float] = NOT_GIVEN,
141+
speech_end_timeout: NotGivenOr[float] = NOT_GIVEN,
138142
use_streaming: NotGivenOr[bool] = NOT_GIVEN,
139143
):
140144
"""
@@ -162,6 +166,8 @@ def __init__(
162166
credentials_info(dict): the credentials info to use for recognition (default: None)
163167
credentials_file(str): the credentials file to use for recognition (default: None)
164168
keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
169+
speech_start_timeout(float): maximum seconds to wait for speech to begin before timeout (default: None)
170+
speech_end_timeout(float): seconds of silence before marking utterance as complete (default: None)
165171
use_streaming(bool): whether to use streaming for recognition (default: True)
166172
"""
167173
if not is_given(use_streaming):
@@ -217,6 +223,8 @@ def __init__(
217223
sample_rate=sample_rate,
218224
min_confidence_threshold=min_confidence_threshold,
219225
keywords=keywords,
226+
speech_start_timeout=speech_start_timeout,
227+
speech_end_timeout=speech_end_timeout,
220228
)
221229
self._streams = weakref.WeakSet[SpeechStream]()
222230
self._pool = utils.ConnectionPool[SpeechAsyncClientV2 | SpeechAsyncClientV1](
@@ -398,6 +406,8 @@ def update_options(
398406
model: NotGivenOr[SpeechModels] = NOT_GIVEN,
399407
location: NotGivenOr[str] = NOT_GIVEN,
400408
keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
409+
speech_start_timeout: NotGivenOr[float] = NOT_GIVEN,
410+
speech_end_timeout: NotGivenOr[float] = NOT_GIVEN,
401411
) -> None:
402412
if is_given(languages):
403413
if isinstance(languages, str):
@@ -425,6 +435,10 @@ def update_options(
425435
self._pool.invalidate()
426436
if is_given(keywords):
427437
self._config.keywords = keywords
438+
if is_given(speech_start_timeout):
439+
self._config.speech_start_timeout = speech_start_timeout
440+
if is_given(speech_end_timeout):
441+
self._config.speech_end_timeout = speech_end_timeout
428442

429443
for stream in self._streams:
430444
stream.update_options(
@@ -436,6 +450,8 @@ def update_options(
436450
profanity_filter=profanity_filter,
437451
model=model,
438452
keywords=keywords,
453+
speech_start_timeout=speech_start_timeout,
454+
speech_end_timeout=speech_end_timeout,
439455
)
440456

441457
async def aclose(self) -> None:
@@ -473,6 +489,8 @@ def update_options(
473489
model: NotGivenOr[SpeechModels] = NOT_GIVEN,
474490
min_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
475491
keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
492+
speech_start_timeout: NotGivenOr[float] = NOT_GIVEN,
493+
speech_end_timeout: NotGivenOr[float] = NOT_GIVEN,
476494
) -> None:
477495
if is_given(languages):
478496
if isinstance(languages, str):
@@ -497,13 +515,36 @@ def update_options(
497515
self._config.min_confidence_threshold = min_confidence_threshold
498516
if is_given(keywords):
499517
self._config.keywords = keywords
518+
if is_given(speech_start_timeout):
519+
self._config.speech_start_timeout = speech_start_timeout
520+
if is_given(speech_end_timeout):
521+
self._config.speech_end_timeout = speech_end_timeout
500522

501523
self._reconnect_event.set()
502524

503525
def _build_streaming_config(
504526
self,
505527
) -> cloud_speech_v2.StreamingRecognitionConfig | cloud_speech_v1.StreamingRecognitionConfig:
506528
if self._config.version == 2:
529+
# Build voice activity timeout if either timeout is specified
530+
voice_activity_timeout = None
531+
if is_given(self._config.speech_start_timeout) or is_given(
532+
self._config.speech_end_timeout
533+
):
534+
voice_activity_timeout = (
535+
cloud_speech_v2.StreamingRecognitionFeatures.VoiceActivityTimeout()
536+
)
537+
if is_given(self._config.speech_start_timeout):
538+
voice_activity_timeout.speech_start_timeout = Duration(
539+
seconds=int(self._config.speech_start_timeout),
540+
nanos=int((self._config.speech_start_timeout % 1) * 1e9),
541+
)
542+
if is_given(self._config.speech_end_timeout):
543+
voice_activity_timeout.speech_end_timeout = Duration(
544+
seconds=int(self._config.speech_end_timeout),
545+
nanos=int((self._config.speech_end_timeout % 1) * 1e9),
546+
)
547+
507548
return cloud_speech_v2.StreamingRecognitionConfig(
508549
config=cloud_speech_v2.RecognitionConfig(
509550
explicit_decoding_config=cloud_speech_v2.ExplicitDecodingConfig(
@@ -524,7 +565,11 @@ def _build_streaming_config(
524565
),
525566
streaming_features=cloud_speech_v2.StreamingRecognitionFeatures(
526567
interim_results=self._config.interim_results,
527-
enable_voice_activity_events=self._config.enable_voice_activity_events,
568+
# Auto-enable voice activity events when voice_activity_timeout is specified,
569+
# as per Google API documentation requirements
570+
enable_voice_activity_events=self._config.enable_voice_activity_events
571+
or (voice_activity_timeout is not None),
572+
voice_activity_timeout=voice_activity_timeout,
528573
),
529574
)
530575

livekit-plugins/livekit-plugins-turn-detector/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ pip install livekit-plugins-turn-detector
1818

1919
### Multilingual model
2020

21-
We've trained a multilingual model that supports the following languages: `English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Indonesian, Russian, Turkish`
21+
We've trained a multilingual model that supports the following languages: `English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Indonesian, Russian, Turkish, Hindi`
2222

2323
The multilingual model requires ~400MB of RAM and completes inferences in ~25ms.
2424

tests/test_plugin_google_stt.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,3 +322,101 @@ async def test_recognize_response_to_speech_event_words():
322322
],
323323
)
324324
]
325+
326+
327+
async def test_voice_activity_timeout_defaults():
328+
"""Test voice activity timeouts are not set by default."""
329+
from livekit.agents.types import NOT_GIVEN
330+
from livekit.plugins.google import STT
331+
332+
stt = STT()
333+
assert stt._config.speech_start_timeout is NOT_GIVEN
334+
assert stt._config.speech_end_timeout is NOT_GIVEN
335+
336+
337+
async def test_voice_activity_timeout_set():
338+
"""Test voice activity timeouts can be set."""
339+
from livekit.plugins.google import STT
340+
341+
stt = STT(
342+
speech_start_timeout=10.0,
343+
speech_end_timeout=2.5,
344+
)
345+
assert stt._config.speech_start_timeout == 10.0
346+
assert stt._config.speech_end_timeout == 2.5
347+
348+
349+
async def test_voice_activity_timeout_fractional_seconds():
350+
"""Test voice activity timeouts handle fractional seconds."""
351+
from livekit.plugins.google import STT
352+
353+
stt = STT(
354+
speech_start_timeout=5.5,
355+
speech_end_timeout=1.25,
356+
)
357+
assert stt._config.speech_start_timeout == 5.5
358+
assert stt._config.speech_end_timeout == 1.25
359+
360+
361+
async def test_voice_activity_timeout_speech_start_only():
362+
"""Test setting only speech_start_timeout."""
363+
from livekit.agents.types import NOT_GIVEN
364+
from livekit.plugins.google import STT
365+
366+
stt = STT(speech_start_timeout=15.0)
367+
assert stt._config.speech_start_timeout == 15.0
368+
assert stt._config.speech_end_timeout is NOT_GIVEN
369+
370+
371+
async def test_voice_activity_timeout_speech_end_only():
372+
"""Test setting only speech_end_timeout."""
373+
from livekit.agents.types import NOT_GIVEN
374+
from livekit.plugins.google import STT
375+
376+
stt = STT(speech_end_timeout=3.0)
377+
assert stt._config.speech_end_timeout == 3.0
378+
assert stt._config.speech_start_timeout is NOT_GIVEN
379+
380+
381+
async def test_voice_activity_timeout_v2_model():
382+
"""Test that V2 model detection works correctly."""
383+
from livekit.plugins.google import STT
384+
385+
stt_v2 = STT(model="chirp_3")
386+
assert stt_v2._config.version == 2
387+
388+
stt_v1 = STT(model="default")
389+
assert stt_v1._config.version == 1
390+
391+
392+
async def test_voice_activity_timeout_update():
393+
"""Test that timeout options can be updated dynamically."""
394+
from livekit.plugins.google import STT
395+
396+
stt = STT(
397+
speech_start_timeout=10.0,
398+
speech_end_timeout=2.0,
399+
)
400+
stt.update_options(
401+
speech_start_timeout=15.0,
402+
speech_end_timeout=3.0,
403+
)
404+
assert stt._config.speech_start_timeout == 15.0
405+
assert stt._config.speech_end_timeout == 3.0
406+
407+
408+
async def test_voice_activity_timeout_partial_update():
409+
"""Test updating only one timeout at a time."""
410+
from livekit.plugins.google import STT
411+
412+
stt = STT(
413+
speech_start_timeout=10.0,
414+
speech_end_timeout=2.0,
415+
)
416+
stt.update_options(speech_start_timeout=20.0)
417+
assert stt._config.speech_start_timeout == 20.0
418+
assert stt._config.speech_end_timeout == 2.0
419+
420+
stt.update_options(speech_end_timeout=5.0)
421+
assert stt._config.speech_start_timeout == 20.0
422+
assert stt._config.speech_end_timeout == 5.0

0 commit comments

Comments
 (0)