From bb31cadd5390021ac8a4b890eb197b668d2ecf9d Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Tue, 5 May 2026 14:17:35 +0200 Subject: [PATCH 1/3] feat(integrations): add support for the `litellm` `responses`/`aresponses` APIs --- sentry_sdk/integrations/litellm.py | 161 +++++++++++++--- tests/integrations/litellm/test_litellm.py | 214 +++++++++++++++++++++ 2 files changed, 345 insertions(+), 30 deletions(-) diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py index 3cff0fbc23..111fa3de2d 100644 --- a/sentry_sdk/integrations/litellm.py +++ b/sentry_sdk/integrations/litellm.py @@ -6,6 +6,7 @@ from sentry_sdk.ai.monitoring import record_token_usage from sentry_sdk.ai.utils import ( get_start_span_function, + normalize_message_roles, set_data_normalized, truncate_and_annotate_messages, transform_openai_content_part, @@ -17,7 +18,7 @@ from sentry_sdk.utils import event_from_exception if TYPE_CHECKING: - from typing import Any, Dict, List + from typing import Any, Dict, List, Optional from datetime import datetime try: @@ -84,16 +85,17 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None: call_type = kwargs.get("call_type", None) if call_type == "embedding" or call_type == "aembedding": operation = "embeddings" + op = consts.OP.GEN_AI_EMBEDDINGS + elif call_type == "responses" or call_type == "aresponses": + operation = "responses" + op = consts.OP.GEN_AI_RESPONSES else: operation = "chat" + op = consts.OP.GEN_AI_CHAT # Start a new span/transaction span = get_start_span_function()( - op=( - consts.OP.GEN_AI_CHAT - if operation == "chat" - else consts.OP.GEN_AI_EMBEDDINGS - ), + op=op, name=f"{operation} {model}", origin=LiteLLMIntegration.origin, ) @@ -106,14 +108,15 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None: set_data_normalized(span, SPANDATA.GEN_AI_SYSTEM, provider) set_data_normalized(span, SPANDATA.GEN_AI_OPERATION_NAME, operation) - # Record input/messages if allowed - if should_send_default_pii() and 
integration.include_prompts: - if operation == "embeddings": - # For embeddings, look for the 'input' parameter + # Per-operation request data. Conversation id (responses) is set + # unconditionally; user-content fields are gated on PII / include_prompts. + record_prompts = should_send_default_pii() and integration.include_prompts + scope = sentry_sdk.get_current_scope() + + if operation == "embeddings": + if record_prompts: embedding_input = kwargs.get("input") if embedding_input: - scope = sentry_sdk.get_current_scope() - # Normalize to list format input_list = ( embedding_input if isinstance(embedding_input, list) @@ -129,11 +132,50 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None: messages_data, unpack=False, ) - else: - # For chat, look for the 'messages' parameter + + elif operation == "responses": + # litellm unpacks `extra_body` into the request body, so the + # `conversation` field shows up in additional_args.complete_input_dict + # rather than as a top-level kwarg. + complete_input = (kwargs.get("additional_args") or {}).get( + "complete_input_dict" + ) or {} + conversation = complete_input.get("conversation") + if conversation is not None: + conversation_id: "Optional[str]" = None + if isinstance(conversation, str): + conversation_id = conversation + elif isinstance(conversation, dict): + conversation_id = conversation.get("id") + if conversation_id is not None: + set_data_normalized( + span, SPANDATA.GEN_AI_CONVERSATION_ID, conversation_id + ) + + if record_prompts: + # `input` is either a string or a list of message dicts (same + # shape as OpenAI Responses API). 
+ responses_input = kwargs.get("input") + if responses_input: + if isinstance(responses_input, str): + input_messages = [responses_input] + else: + input_messages = list(responses_input) + normalized = normalize_message_roles(input_messages) # type: ignore[arg-type] + messages_data = truncate_and_annotate_messages(normalized, span, scope) + if messages_data is not None: + set_data_normalized( + span, + SPANDATA.GEN_AI_REQUEST_MESSAGES, + messages_data, + unpack=False, + ) + + else: + # Chat completions. + if record_prompts: messages = kwargs.get("messages", []) if messages: - scope = sentry_sdk.get_current_scope() messages = _convert_message_parts(messages) messages_data = truncate_and_annotate_messages(messages, span, scope) if messages_data is not None: @@ -166,11 +208,24 @@ async def _async_input_callback(kwargs: "Dict[str, Any]") -> None: def _success_callback( kwargs: "Dict[str, Any]", - completion_response: "Any", + response: "Any", start_time: "datetime", end_time: "datetime", ) -> None: - """Handle successful completion.""" + """Handle a successful chat completion, embeddings, or Responses API call. + + The shape of `response` differs between API paths: + - Chat Completions: ModelResponse with ``.choices[].message`` and + ``.usage`` carrying ``prompt_tokens`` / ``completion_tokens``. + - Responses API (non-streaming): ResponsesAPIResponse with ``.output[]`` + items (``message`` / ``function_call``) and ``.usage`` carrying + ``input_tokens`` / ``output_tokens``. + - Responses API (streaming): a ResponseCompletedEvent wrapper + ``{type: "response.completed", response: ResponsesAPIResponse}``, + which we unwrap below. + - Embeddings: CreateEmbeddingResponse with ``.usage`` only (no choices + or output). 
+ """ metadata = _get_metadata_dict(kwargs) span = metadata.get("_sentry_span") @@ -181,18 +236,25 @@ def _success_callback( if integration is None: return + # Streaming Responses API: unwrap the ResponseCompletedEvent so the rest of + # the function sees the assembled ResponsesAPIResponse directly. + if getattr(response, "type", None) == "response.completed" and hasattr( + response, "response" + ): + response = response.response + try: - # Record model information - if hasattr(completion_response, "model"): - set_data_normalized( - span, SPANDATA.GEN_AI_RESPONSE_MODEL, completion_response.model - ) + # `model` is set by all API shapes (chat / responses / embeddings). + if hasattr(response, "model"): + set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, response.model) - # Record response content if allowed + # Response content: structure depends on the API shape. Embeddings have + # neither ``choices`` nor ``output``, so we just skip this block. if should_send_default_pii() and integration.include_prompts: - if hasattr(completion_response, "choices"): + if hasattr(response, "choices"): + # Chat Completions API. response_messages = [] - for choice in completion_response.choices: + for choice in response.choices: if hasattr(choice, "message"): if hasattr(choice.message, "model_dump"): response_messages.append(choice.message.model_dump()) @@ -213,14 +275,53 @@ def _success_callback( set_data_normalized( span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages ) + elif hasattr(response, "output"): + # Responses API: split message text from function-call items. 
+ output_text: "List[Any]" = [] + tool_calls: "List[Any]" = [] + for output in response.output: + output_type = getattr(output, "type", None) + if output_type == "function_call": + if hasattr(output, "model_dump"): + tool_calls.append(output.model_dump()) + elif hasattr(output, "dict"): + tool_calls.append(output.dict()) + elif output_type == "message": + for content_item in getattr(output, "content", []) or []: + text = getattr(content_item, "text", None) + if text is not None: + output_text.append(text) + elif hasattr(content_item, "model_dump"): + output_text.append(content_item.model_dump()) + elif hasattr(content_item, "dict"): + output_text.append(content_item.dict()) + + if tool_calls: + set_data_normalized( + span, + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS, + tool_calls, + unpack=False, + ) + if output_text: + set_data_normalized( + span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_text + ) - # Record token usage - if hasattr(completion_response, "usage"): - usage = completion_response.usage + # Token usage field names differ across APIs: + # Chat Completions / Embeddings: prompt_tokens / completion_tokens + # Responses API (non-streaming): input_tokens / output_tokens + # Responses API (streaming): prompt_tokens / completion_tokens + # (litellm normalizes to chat-completion names when assembling the + # streaming response). 
+ if hasattr(response, "usage"): + usage = response.usage record_token_usage( span, - input_tokens=getattr(usage, "prompt_tokens", None), - output_tokens=getattr(usage, "completion_tokens", None), + input_tokens=getattr(usage, "prompt_tokens", None) + or getattr(usage, "input_tokens", None), + output_tokens=getattr(usage, "completion_tokens", None) + or getattr(usage, "output_tokens", None), total_tokens=getattr(usage, "total_tokens", None), ) diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py index 18f8cfaf6e..1137a88fe3 100644 --- a/tests/integrations/litellm/test_litellm.py +++ b/tests/integrations/litellm/test_litellm.py @@ -2135,3 +2135,217 @@ def test_convert_message_parts_image_url_missing_url(): converted = _convert_message_parts(messages) # Should return item unchanged assert converted[0]["content"][0]["type"] == "image_url" + + +class MockResponsesUsage: + def __init__(self, input_tokens=12, output_tokens=24, total_tokens=36): + self.input_tokens = input_tokens + self.output_tokens = output_tokens + self.total_tokens = total_tokens + + +class MockResponsesContentItem: + def __init__(self, text): + self.type = "output_text" + self.text = text + + +class MockResponsesOutputMessage: + def __init__(self, text): + self.type = "message" + self.role = "assistant" + self.content = [MockResponsesContentItem(text)] + + +class MockResponsesResponse: + def __init__( + self, + model="gpt-4.1-nano", + output=None, + usage=None, + ): + self.id = "resp-test" + self.model = model + self.output = output or [MockResponsesOutputMessage("the model response")] + self.usage = usage or MockResponsesUsage() + + +def _build_responses_kwargs( + *, + input_value="What is the capital of France?", + conversation=None, + model="openai/gpt-4.1-nano", + extra_body_conversation=None, +): + """ + Build the kwargs shape that litellm passes to input_callback for a + responses() call. 
`extra_body` is unpacked into + additional_args.complete_input_dict by litellm before callbacks fire. + """ + complete_input_dict = {"model": model.split("/")[-1], "input": input_value} + if extra_body_conversation is not None: + complete_input_dict["conversation"] = extra_body_conversation + elif conversation is not None: + complete_input_dict["conversation"] = conversation + + return { + "model": model, + "input": input_value, + "call_type": "responses", + "additional_args": {"complete_input_dict": complete_input_dict}, + } + + +@pytest.mark.parametrize( + "conversation, expected_id", + [ + pytest.param(None, None, id="no_conversation"), + pytest.param({"id": "conv_abc123"}, "conv_abc123", id="dict"), + pytest.param("conv_str_id", "conv_str_id", id="string"), + ], +) +def test_responses_conversation_id( + sentry_init, capture_events, conversation, expected_id +): + sentry_init( + integrations=[LiteLLMIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + kwargs = _build_responses_kwargs(extra_body_conversation=conversation) + + with start_transaction(name="litellm test"): + _input_callback(kwargs) + _success_callback( + kwargs, MockResponsesResponse(), datetime.now(), datetime.now() + ) + + (event,) = events + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_RESPONSES + assert span["description"] == "responses gpt-4.1-nano" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses" + + if expected_id is None: + assert SPANDATA.GEN_AI_CONVERSATION_ID not in span["data"] + else: + assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == expected_id + + +def test_responses_records_input_output_and_usage(sentry_init, capture_events): + sentry_init( + integrations=[LiteLLMIntegration(include_prompts=True)], + traces_sample_rate=1.0, + send_default_pii=True, + ) + events = capture_events() + + kwargs = _build_responses_kwargs( + extra_body_conversation={"id": "conv_xyz"}, + ) + + with start_transaction(name="litellm test"): + 
_input_callback(kwargs) + _success_callback( + kwargs, MockResponsesResponse(), datetime.now(), datetime.now() + ) + + (event,) = events + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_RESPONSES + assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 12 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 24 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36 + + +def test_responses_no_pii_omits_messages(sentry_init, capture_events): + sentry_init( + integrations=[LiteLLMIntegration(include_prompts=True)], + traces_sample_rate=1.0, + # send_default_pii not set -> defaults to False + ) + events = capture_events() + + kwargs = _build_responses_kwargs(extra_body_conversation={"id": "conv_xyz"}) + + with start_transaction(name="litellm test"): + _input_callback(kwargs) + _success_callback( + kwargs, MockResponsesResponse(), datetime.now(), datetime.now() + ) + + (event,) = events + (span,) = event["spans"] + + # Conversation id is not PII, but request/response content is + assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + +class MockResponseCompletedEvent: + """Mimics litellm/openai's response.completed streaming wrapper.""" + + def __init__(self, response): + self.type = "response.completed" + self.response = response + + +def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events): + """For streaming responses, success_handler receives a ResponseCompletedEvent + wrapper. 
We must unwrap it to read usage/output from the inner response.""" + sentry_init( + integrations=[LiteLLMIntegration(include_prompts=True)], + traces_sample_rate=1.0, + send_default_pii=True, + ) + events = capture_events() + + kwargs = _build_responses_kwargs() + kwargs["stream"] = True + kwargs["complete_streaming_response"] = MockResponsesResponse() + + wrapper = MockResponseCompletedEvent(MockResponsesResponse()) + + with start_transaction(name="litellm test"): + _input_callback(kwargs) + _success_callback(kwargs, wrapper, datetime.now(), datetime.now()) + + (event,) = events + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_RESPONSES + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 12 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 24 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36 + assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + +def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events): + """aresponses (async) call_type should produce a responses span.""" + sentry_init( + integrations=[LiteLLMIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + kwargs = _build_responses_kwargs() + kwargs["call_type"] = "aresponses" + + with start_transaction(name="litellm test"): + _input_callback(kwargs) + _success_callback( + kwargs, MockResponsesResponse(), datetime.now(), datetime.now() + ) + + (event,) = events + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_RESPONSES + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses" From d39b073c8167484e64bae36ad2e00551bafd015d Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Tue, 5 May 2026 14:45:04 +0200 Subject: [PATCH 2/3] fix: different handlings for some token usages --- sentry_sdk/integrations/litellm.py | 31 ++++++++++++--- tests/integrations/litellm/test_litellm.py | 44 ++++++++++++++++++++++ 2 files changed, 69 insertions(+), 6 deletions(-) diff --git 
a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py index 111fa3de2d..5fee701ed9 100644 --- a/sentry_sdk/integrations/litellm.py +++ b/sentry_sdk/integrations/litellm.py @@ -40,6 +40,23 @@ def _get_metadata_dict(kwargs: "Dict[str, Any]") -> "Dict[str, Any]": return metadata +def _read_usage_field(usage: "Any", *names: str) -> "Optional[int]": + """Read the first non-None field from a usage container. + + The usage object can be either a typed Pydantic model (attribute access) or + a plain dict (litellm hands us a dict for the assembled async-streaming + response), so we try both shapes. + """ + for name in names: + if isinstance(usage, dict): + value = usage.get(name) + else: + value = getattr(usage, name, None) + if value is not None: + return value + return None + + def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, Any]]": """ Convert the message parts from OpenAI format to the `gen_ai.request.messages` format @@ -313,16 +330,18 @@ def _success_callback( # Responses API (non-streaming): input_tokens / output_tokens # Responses API (streaming): prompt_tokens / completion_tokens # (litellm normalizes to chat-completion names when assembling the - # streaming response). + # streaming response). For the async-streaming variant, the + # assembled `usage` is a plain dict, not a Pydantic model — hence + # `_read_usage_field` supports both shapes. 
if hasattr(response, "usage"): usage = response.usage record_token_usage( span, - input_tokens=getattr(usage, "prompt_tokens", None) - or getattr(usage, "input_tokens", None), - output_tokens=getattr(usage, "completion_tokens", None) - or getattr(usage, "output_tokens", None), - total_tokens=getattr(usage, "total_tokens", None), + input_tokens=_read_usage_field(usage, "prompt_tokens", "input_tokens"), + output_tokens=_read_usage_field( + usage, "completion_tokens", "output_tokens" + ), + total_tokens=_read_usage_field(usage, "total_tokens"), ) finally: diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py index 1137a88fe3..5dcf16003f 100644 --- a/tests/integrations/litellm/test_litellm.py +++ b/tests/integrations/litellm/test_litellm.py @@ -2327,6 +2327,50 @@ def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] +class MockResponseWithDictUsage: + """Mimics the assembled async-streaming responses object: usage is a dict + (not a Pydantic model), as litellm hands us for that path.""" + + def __init__(self): + self.id = "resp-test" + self.model = "gpt-4.1-nano" + self.output = [MockResponsesOutputMessage("hi")] + self.usage = { + "prompt_tokens": 7, + "completion_tokens": 2, + "total_tokens": 9, + } + + +def test_responses_async_streaming_dict_usage(sentry_init, capture_events): + """For async streaming responses, litellm assembles `usage` as a plain dict. 
+ `getattr(dict, ...)` would silently miss it; we need to support both shapes.""" + sentry_init( + integrations=[LiteLLMIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + kwargs = _build_responses_kwargs() + kwargs["call_type"] = "aresponses" + kwargs["stream"] = True + kwargs["async_complete_streaming_response"] = MockResponseWithDictUsage() + + with start_transaction(name="litellm test"): + _input_callback(kwargs) + _success_callback( + kwargs, MockResponseWithDictUsage(), datetime.now(), datetime.now() + ) + + (event,) = events + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_RESPONSES + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 7 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 2 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 9 + + def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events): """aresponses (async) call_type should produce a responses span.""" sentry_init( From b3d837c704d4e292bc45b2289cc097c0f1b58f3a Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Mon, 18 May 2026 13:31:11 +0200 Subject: [PATCH 3/3] feat(tests): enhance tests for Responses API integration and conversation ID handling --- sentry_sdk/integrations/litellm.py | 284 +++++++++--------- tests/integrations/litellm/test_litellm.py | 318 ++++++++++++--------- 2 files changed, 343 insertions(+), 259 deletions(-) diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py index 5fee701ed9..ed890ef6fd 100644 --- a/sentry_sdk/integrations/litellm.py +++ b/sentry_sdk/integrations/litellm.py @@ -18,12 +18,17 @@ from sentry_sdk.utils import event_from_exception if TYPE_CHECKING: - from typing import Any, Dict, List, Optional + from typing import Any, Dict, List from datetime import datetime try: import litellm # type: ignore[import-not-found] from litellm import input_callback, success_callback, failure_callback + from litellm.types.llms.openai import ( # type: 
ignore[import-not-found] + ResponseAPIUsage, + ResponseCompletedEvent, + ResponsesAPIResponse, + ) except ImportError: raise DidNotEnable("LiteLLM not installed") @@ -40,23 +45,6 @@ def _get_metadata_dict(kwargs: "Dict[str, Any]") -> "Dict[str, Any]": return metadata -def _read_usage_field(usage: "Any", *names: str) -> "Optional[int]": - """Read the first non-None field from a usage container. - - The usage object can be either a typed Pydantic model (attribute access) or - a plain dict (litellm hands us a dict for the assembled async-streaming - response), so we try both shapes. - """ - for name in names: - if isinstance(usage, dict): - value = usage.get(name) - else: - value = getattr(usage, name, None) - if value is not None: - return value - return None - - def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, Any]]": """ Convert the message parts from OpenAI format to the `gen_ai.request.messages` format @@ -84,6 +72,48 @@ def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, return messages +def _record_responses_conversation_id( + span: "Any", complete_input: "Dict[str, Any]" +) -> None: + """Set the conversation id on the span when the Responses API request carries one.""" + conversation = complete_input.get("conversation") + if conversation is None: + return + + if isinstance(conversation, str): + conversation_id = conversation + elif isinstance(conversation, dict): + conversation_id = conversation.get("id") + else: + conversation_id = None + + if conversation_id is not None: + set_data_normalized(span, SPANDATA.GEN_AI_CONVERSATION_ID, conversation_id) + + +def _record_responses_input_messages( + span: "Any", scope: "Any", responses_input: "Any" +) -> None: + """Record the request messages for a Responses API call.""" + if not responses_input: + return + + # `input` is either a string or a list of message dicts (same shape as + # the OpenAI Responses API). 
+ if isinstance(responses_input, str): + input_messages = [responses_input] + else: + input_messages = list(responses_input) + normalized = normalize_message_roles(input_messages) # type: ignore[arg-type] + messages_data = truncate_and_annotate_messages(normalized, span, scope) + if messages_data is not None: + set_data_normalized( + span, SPANDATA.GEN_AI_REQUEST_MESSAGES, + messages_data, + unpack=False, + ) + + def _input_callback(kwargs: "Dict[str, Any]") -> None: """Handle the start of a request.""" integration = sentry_sdk.get_client().get_integration(LiteLLMIntegration) @@ -157,36 +187,9 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None: complete_input = (kwargs.get("additional_args") or {}).get( "complete_input_dict" ) or {} - conversation = complete_input.get("conversation") - if conversation is not None: - conversation_id: "Optional[str]" = None - if isinstance(conversation, str): - conversation_id = conversation - elif isinstance(conversation, dict): - conversation_id = conversation.get("id") - if conversation_id is not None: - set_data_normalized( - span, SPANDATA.GEN_AI_CONVERSATION_ID, conversation_id - ) - + _record_responses_conversation_id(span, complete_input) if record_prompts: - # `input` is either a string or a list of message dicts (same - # shape as OpenAI Responses API). - responses_input = kwargs.get("input") - if responses_input: - if isinstance(responses_input, str): - input_messages = [responses_input] - else: - input_messages = list(responses_input) - normalized = normalize_message_roles(input_messages) # type: ignore[arg-type] - messages_data = truncate_and_annotate_messages(normalized, span, scope) - if messages_data is not None: - set_data_normalized( - span, - SPANDATA.GEN_AI_REQUEST_MESSAGES, - messages_data, - unpack=False, - ) + _record_responses_input_messages(span, scope, kwargs.get("input")) else: # Chat completions. 
@@ -223,6 +226,103 @@ async def _async_input_callback(kwargs: "Dict[str, Any]") -> None: return _input_callback(kwargs) +def _record_chat_response_messages(span: "Any", response: "Any") -> None: + """Record response.text from a Chat Completions response.""" + response_messages = [] + for choice in response.choices: + message = getattr(choice, "message", None) + if message is None: + continue + if hasattr(message, "model_dump"): + response_messages.append(message.model_dump()) + elif hasattr(message, "dict"): + response_messages.append(message.dict()) + else: + # Fallback for basic message objects + msg = {} + if hasattr(message, "role"): + msg["role"] = message.role + if hasattr(message, "content"): + msg["content"] = message.content + if hasattr(message, "tool_calls"): + msg["tool_calls"] = message.tool_calls + response_messages.append(msg) + + if response_messages: + set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages) + + +def _record_responses_output(span: "Any", response: "ResponsesAPIResponse") -> None: + """Record response text and tool calls from a Responses API response.""" + output_text = [] # type: List[Any] + tool_calls = [] # type: List[Any] + for output in response.output: + output_type = getattr(output, "type", None) + if output_type == "function_call": + if hasattr(output, "model_dump"): + tool_calls.append(output.model_dump()) + elif hasattr(output, "dict"): + tool_calls.append(output.dict()) + elif output_type == "message": + for content_item in getattr(output, "content", []) or []: + text = getattr(content_item, "text", None) + if text is not None: + output_text.append(text) + elif hasattr(content_item, "model_dump"): + output_text.append(content_item.model_dump()) + elif hasattr(content_item, "dict"): + output_text.append(content_item.dict()) + + if tool_calls: + set_data_normalized( + span, + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS, + tool_calls, + unpack=False, + ) + if output_text: + set_data_normalized(span, 
SPANDATA.GEN_AI_RESPONSE_TEXT, output_text) + + +def _record_token_usage_from_response(span: "Any", response: "Any") -> None: + """Record token usage. The shape of ``usage`` depends on the litellm + processing pipeline rather than the API path: + + - ``ResponseAPIUsage``: raw Responses API usage (``input_tokens`` / + ``output_tokens``). Seen when litellm has not yet normalized the value. + - ``dict``: chat-style dict (``prompt_tokens`` / ``completion_tokens``). + litellm assembles streaming Responses API usage as a dict. + - Otherwise: chat-style Pydantic ``Usage`` (``prompt_tokens`` / + ``completion_tokens``). Used for Chat Completions, Embeddings, and + non-streaming Responses API after litellm's post-processing. + """ + usage = getattr(response, "usage", None) + if usage is None: + return + + if isinstance(usage, ResponseAPIUsage): + record_token_usage( + span, + input_tokens=usage.input_tokens, + output_tokens=usage.output_tokens, + total_tokens=usage.total_tokens, + ) + elif isinstance(usage, dict): + record_token_usage( + span, + input_tokens=usage.get("prompt_tokens"), + output_tokens=usage.get("completion_tokens"), + total_tokens=usage.get("total_tokens"), + ) + else: + record_token_usage( + span, + input_tokens=getattr(usage, "prompt_tokens", None), + output_tokens=getattr(usage, "completion_tokens", None), + total_tokens=getattr(usage, "total_tokens", None), + ) + + def _success_callback( kwargs: "Dict[str, Any]", response: "Any", @@ -237,9 +337,8 @@ def _success_callback( - Responses API (non-streaming): ResponsesAPIResponse with ``.output[]`` items (``message`` / ``function_call``) and ``.usage`` carrying ``input_tokens`` / ``output_tokens``. - - Responses API (streaming): a ResponseCompletedEvent wrapper - ``{type: "response.completed", response: ResponsesAPIResponse}``, - which we unwrap below. + - Responses API (streaming): a ResponseCompletedEvent wrapping a + ``ResponsesAPIResponse``, which we unwrap below. 
- Embeddings: CreateEmbeddingResponse with ``.usage`` only (no choices or output). """ @@ -255,9 +354,7 @@ def _success_callback( # Streaming Responses API: unwrap the ResponseCompletedEvent so the rest of # the function sees the assembled ResponsesAPIResponse directly. - if getattr(response, "type", None) == "response.completed" and hasattr( - response, "response" - ): + if isinstance(response, ResponseCompletedEvent): response = response.response try: @@ -265,84 +362,13 @@ def _success_callback( if hasattr(response, "model"): set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, response.model) - # Response content: structure depends on the API shape. Embeddings have - # neither ``choices`` nor ``output``, so we just skip this block. if should_send_default_pii() and integration.include_prompts: - if hasattr(response, "choices"): - # Chat Completions API. - response_messages = [] - for choice in response.choices: - if hasattr(choice, "message"): - if hasattr(choice.message, "model_dump"): - response_messages.append(choice.message.model_dump()) - elif hasattr(choice.message, "dict"): - response_messages.append(choice.message.dict()) - else: - # Fallback for basic message objects - msg = {} - if hasattr(choice.message, "role"): - msg["role"] = choice.message.role - if hasattr(choice.message, "content"): - msg["content"] = choice.message.content - if hasattr(choice.message, "tool_calls"): - msg["tool_calls"] = choice.message.tool_calls - response_messages.append(msg) - - if response_messages: - set_data_normalized( - span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages - ) - elif hasattr(response, "output"): - # Responses API: split message text from function-call items. 
- output_text: "List[Any]" = [] - tool_calls: "List[Any]" = [] - for output in response.output: - output_type = getattr(output, "type", None) - if output_type == "function_call": - if hasattr(output, "model_dump"): - tool_calls.append(output.model_dump()) - elif hasattr(output, "dict"): - tool_calls.append(output.dict()) - elif output_type == "message": - for content_item in getattr(output, "content", []) or []: - text = getattr(content_item, "text", None) - if text is not None: - output_text.append(text) - elif hasattr(content_item, "model_dump"): - output_text.append(content_item.model_dump()) - elif hasattr(content_item, "dict"): - output_text.append(content_item.dict()) - - if tool_calls: - set_data_normalized( - span, - SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS, - tool_calls, - unpack=False, - ) - if output_text: - set_data_normalized( - span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_text - ) + if isinstance(response, ResponsesAPIResponse): + _record_responses_output(span, response) + elif hasattr(response, "choices"): + _record_chat_response_messages(span, response) - # Token usage field names differ across APIs: - # Chat Completions / Embeddings: prompt_tokens / completion_tokens - # Responses API (non-streaming): input_tokens / output_tokens - # Responses API (streaming): prompt_tokens / completion_tokens - # (litellm normalizes to chat-completion names when assembling the - # streaming response). For the async-streaming variant, the - # assembled `usage` is a plain dict, not a Pydantic model — hence - # `_read_usage_field` supports both shapes. 
- if hasattr(response, "usage"): - usage = response.usage - record_token_usage( - span, - input_tokens=_read_usage_field(usage, "prompt_tokens", "input_tokens"), - output_tokens=_read_usage_field( - usage, "completion_tokens", "output_tokens" - ), - total_tokens=_read_usage_field(usage, "total_tokens"), - ) + _record_token_usage_from_response(span, response) finally: is_streaming = kwargs.get("stream") diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py index 5dcf16003f..18f18cbfeb 100644 --- a/tests/integrations/litellm/test_litellm.py +++ b/tests/integrations/litellm/test_litellm.py @@ -2137,63 +2137,52 @@ def test_convert_message_parts_image_url_missing_url(): assert converted[0]["content"][0]["type"] == "image_url" -class MockResponsesUsage: - def __init__(self, input_tokens=12, output_tokens=24, total_tokens=36): - self.input_tokens = input_tokens - self.output_tokens = output_tokens - self.total_tokens = total_tokens - - -class MockResponsesContentItem: - def __init__(self, text): - self.type = "output_text" - self.text = text - - -class MockResponsesOutputMessage: - def __init__(self, text): - self.type = "message" - self.role = "assistant" - self.content = [MockResponsesContentItem(text)] - - -class MockResponsesResponse: - def __init__( - self, - model="gpt-4.1-nano", - output=None, - usage=None, - ): - self.id = "resp-test" - self.model = model - self.output = output or [MockResponsesOutputMessage("the model response")] - self.usage = usage or MockResponsesUsage() - - -def _build_responses_kwargs( - *, - input_value="What is the capital of France?", - conversation=None, - model="openai/gpt-4.1-nano", - extra_body_conversation=None, +def _make_responses_api_response( + model="gpt-4.1-nano", + text="the model response", + input_tokens=12, + output_tokens=24, + total_tokens=36, ): - """ - Build the kwargs shape that litellm passes to input_callback for a - responses() call. 
`extra_body` is unpacked into - additional_args.complete_input_dict by litellm before callbacks fire. - """ - complete_input_dict = {"model": model.split("/")[-1], "input": input_value} - if extra_body_conversation is not None: - complete_input_dict["conversation"] = extra_body_conversation - elif conversation is not None: - complete_input_dict["conversation"] = conversation - - return { - "model": model, - "input": input_value, - "call_type": "responses", - "additional_args": {"complete_input_dict": complete_input_dict}, - } + """Build a real openai.types.responses.Response for use as a fake HTTP + payload. litellm parses the JSON into a litellm ResponsesAPIResponse.""" + import openai.types.responses as resp_types + + return resp_types.Response( + id="resp-test", + output=[ + resp_types.ResponseOutputMessage( + id="msg_123", + type="message", + status="completed", + content=[ + resp_types.ResponseOutputText( + text=text, + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model=model, + object="response", + usage=resp_types.ResponseUsage( + input_tokens=input_tokens, + input_tokens_details=resp_types.response_usage.InputTokensDetails( + cached_tokens=0, + ), + output_tokens=output_tokens, + output_tokens_details=resp_types.response_usage.OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=total_tokens, + ), + ) @pytest.mark.parametrize( @@ -2205,7 +2194,12 @@ def _build_responses_kwargs( ], ) def test_responses_conversation_id( - sentry_init, capture_events, conversation, expected_id + reset_litellm_executor, + sentry_init, + capture_events, + get_model_response, + conversation, + expected_id, ): sentry_init( integrations=[LiteLLMIntegration()], @@ -2213,18 +2207,30 @@ def test_responses_conversation_id( ) events = capture_events() - kwargs = _build_responses_kwargs(extra_body_conversation=conversation) + client = HTTPHandler() + 
fake_response = get_model_response( + _make_responses_api_response(), + serialize_pydantic=True, + ) - with start_transaction(name="litellm test"): - _input_callback(kwargs) - _success_callback( - kwargs, MockResponsesResponse(), datetime.now(), datetime.now() - ) + extra_body = {"conversation": conversation} if conversation is not None else {} + + with mock.patch.object(client, "post", return_value=fake_response): + with start_transaction(name="litellm test"): + litellm.responses( + model="openai/gpt-4.1-nano", + input="What is the capital of France?", + client=client, + extra_body=extra_body, + ) + litellm_utils.executor.shutdown(wait=True) (event,) = events - (span,) = event["spans"] + response_spans = [ + span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + (span,) = response_spans - assert span["op"] == OP.GEN_AI_RESPONSES assert span["description"] == "responses gpt-4.1-nano" assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses" @@ -2234,7 +2240,12 @@ def test_responses_conversation_id( assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == expected_id -def test_responses_records_input_output_and_usage(sentry_init, capture_events): +def test_responses_records_input_output_and_usage( + reset_litellm_executor, + sentry_init, + capture_events, + get_model_response, +): sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, @@ -2242,20 +2253,28 @@ def test_responses_records_input_output_and_usage(sentry_init, capture_events): ) events = capture_events() - kwargs = _build_responses_kwargs( - extra_body_conversation={"id": "conv_xyz"}, + client = HTTPHandler() + fake_response = get_model_response( + _make_responses_api_response(text="the model response"), + serialize_pydantic=True, ) - with start_transaction(name="litellm test"): - _input_callback(kwargs) - _success_callback( - kwargs, MockResponsesResponse(), datetime.now(), datetime.now() - ) + with mock.patch.object(client, "post", 
return_value=fake_response): + with start_transaction(name="litellm test"): + litellm.responses( + model="openai/gpt-4.1-nano", + input="What is the capital of France?", + client=client, + extra_body={"conversation": {"id": "conv_xyz"}}, + ) + litellm_utils.executor.shutdown(wait=True) (event,) = events - (span,) = event["spans"] + response_spans = [ + span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + (span,) = response_spans - assert span["op"] == OP.GEN_AI_RESPONSES assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz" assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] @@ -2264,7 +2283,12 @@ def test_responses_records_input_output_and_usage(sentry_init, capture_events): assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36 -def test_responses_no_pii_omits_messages(sentry_init, capture_events): +def test_responses_no_pii_omits_messages( + reset_litellm_executor, + sentry_init, + capture_events, + get_model_response, +): sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, @@ -2272,16 +2296,27 @@ def test_responses_no_pii_omits_messages(sentry_init, capture_events): ) events = capture_events() - kwargs = _build_responses_kwargs(extra_body_conversation={"id": "conv_xyz"}) + client = HTTPHandler() + fake_response = get_model_response( + _make_responses_api_response(), + serialize_pydantic=True, + ) - with start_transaction(name="litellm test"): - _input_callback(kwargs) - _success_callback( - kwargs, MockResponsesResponse(), datetime.now(), datetime.now() - ) + with mock.patch.object(client, "post", return_value=fake_response): + with start_transaction(name="litellm test"): + litellm.responses( + model="openai/gpt-4.1-nano", + input="What is the capital of France?", + client=client, + extra_body={"conversation": {"id": "conv_xyz"}}, + ) + litellm_utils.executor.shutdown(wait=True) (event,) = events - 
(span,) = event["spans"] + response_spans = [ + span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + (span,) = response_spans # Conversation id is not PII, but request/response content is assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz" @@ -2289,17 +2324,12 @@ def test_responses_no_pii_omits_messages(sentry_init, capture_events): assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] -class MockResponseCompletedEvent: - """Mimics litellm/openai's response.completed streaming wrapper.""" - - def __init__(self, response): - self.type = "response.completed" - self.response = response - - def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events): """For streaming responses, success_handler receives a ResponseCompletedEvent - wrapper. We must unwrap it to read usage/output from the inner response.""" + wrapping the assembled ResponsesAPIResponse. We must unwrap it to read + usage/output from the inner response.""" + from litellm.types.llms.openai import ResponsesAPIResponse, ResponseCompletedEvent + sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, @@ -2307,11 +2337,19 @@ def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events ) events = capture_events() - kwargs = _build_responses_kwargs() - kwargs["stream"] = True - kwargs["complete_streaming_response"] = MockResponsesResponse() + inner_response = ResponsesAPIResponse( + **_make_responses_api_response().model_dump(by_alias=True, exclude_none=True), + ) + wrapper = ResponseCompletedEvent(type="response.completed", response=inner_response) - wrapper = MockResponseCompletedEvent(MockResponsesResponse()) + kwargs = { + "model": "openai/gpt-4.1-nano", + "input": "What is the capital of France?", + "call_type": "responses", + "stream": True, + "complete_streaming_response": inner_response, + "additional_args": {"complete_input_dict": {}}, + } with start_transaction(name="litellm test"): 
_input_callback(kwargs) @@ -2327,40 +2365,43 @@ def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] -class MockResponseWithDictUsage: - """Mimics the assembled async-streaming responses object: usage is a dict - (not a Pydantic model), as litellm hands us for that path.""" - - def __init__(self): - self.id = "resp-test" - self.model = "gpt-4.1-nano" - self.output = [MockResponsesOutputMessage("hi")] - self.usage = { - "prompt_tokens": 7, - "completion_tokens": 2, - "total_tokens": 9, - } - - def test_responses_async_streaming_dict_usage(sentry_init, capture_events): - """For async streaming responses, litellm assembles `usage` as a plain dict. - `getattr(dict, ...)` would silently miss it; we need to support both shapes.""" + """For async streaming responses, litellm transforms `usage` into a chat-style + dict on the assembled ResponsesAPIResponse (see + Logging._get_assembled_streaming_response). We must read the chat-style keys + when usage is a dict.""" + from litellm.types.llms.openai import ResponsesAPIResponse + sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, ) events = capture_events() - kwargs = _build_responses_kwargs() - kwargs["call_type"] = "aresponses" - kwargs["stream"] = True - kwargs["async_complete_streaming_response"] = MockResponseWithDictUsage() + response_data = _make_responses_api_response().model_dump( + by_alias=True, exclude_none=True + ) + response = ResponsesAPIResponse(**response_data) + # litellm replaces ResponseAPIUsage with a chat-style dict during streaming + # assembly; mirror that mutation here. 
+ response.usage = { + "prompt_tokens": 7, + "completion_tokens": 2, + "total_tokens": 9, + } + + kwargs = { + "model": "openai/gpt-4.1-nano", + "input": "What is the capital of France?", + "call_type": "aresponses", + "stream": True, + "async_complete_streaming_response": response, + "additional_args": {"complete_input_dict": {}}, + } with start_transaction(name="litellm test"): _input_callback(kwargs) - _success_callback( - kwargs, MockResponseWithDictUsage(), datetime.now(), datetime.now() - ) + _success_callback(kwargs, response, datetime.now(), datetime.now()) (event,) = events (span,) = event["spans"] @@ -2371,7 +2412,12 @@ def test_responses_async_streaming_dict_usage(sentry_init, capture_events): assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 9 -def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events): +@pytest.mark.asyncio(loop_scope="session") +async def test_aresponses_call_type_treated_as_responses( + sentry_init, + capture_events, + get_model_response, +): """aresponses (async) call_type should produce a responses span.""" sentry_init( integrations=[LiteLLMIntegration()], @@ -2379,17 +2425,29 @@ def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events): ) events = capture_events() - kwargs = _build_responses_kwargs() - kwargs["call_type"] = "aresponses" + client = AsyncHTTPHandler() + fake_response = get_model_response( + _make_responses_api_response(), + serialize_pydantic=True, + ) + + async def fake_post(*args, **kwargs): + return fake_response - with start_transaction(name="litellm test"): - _input_callback(kwargs) - _success_callback( - kwargs, MockResponsesResponse(), datetime.now(), datetime.now() - ) + with mock.patch.object(client, "post", new=fake_post): + with start_transaction(name="litellm test"): + await litellm.aresponses( + model="openai/gpt-4.1-nano", + input="What is the capital of France?", + client=client, + ) + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) 
(event,) = events - (span,) = event["spans"] + response_spans = [ + span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + (span,) = response_spans - assert span["op"] == OP.GEN_AI_RESPONSES assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses"