From bb31cadd5390021ac8a4b890eb197b668d2ecf9d Mon Sep 17 00:00:00 2001
From: Fabian Schindler <fabian.schindler@sentry.io>
Date: Tue, 5 May 2026 14:17:35 +0200
Subject: [PATCH 1/3] feat(integrations): add support for the `litellm`
 `responses`/`aresponses` APIs

---
 sentry_sdk/integrations/litellm.py         | 161 +++++++++++++---
 tests/integrations/litellm/test_litellm.py | 214 +++++++++++++++++++++
 2 files changed, 345 insertions(+), 30 deletions(-)

diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py
index 3cff0fbc23..111fa3de2d 100644
--- a/sentry_sdk/integrations/litellm.py
+++ b/sentry_sdk/integrations/litellm.py
@@ -6,6 +6,7 @@
 from sentry_sdk.ai.monitoring import record_token_usage
 from sentry_sdk.ai.utils import (
     get_start_span_function,
+    normalize_message_roles,
     set_data_normalized,
     truncate_and_annotate_messages,
     transform_openai_content_part,
@@ -17,7 +18,7 @@
 from sentry_sdk.utils import event_from_exception
 
 if TYPE_CHECKING:
-    from typing import Any, Dict, List
+    from typing import Any, Dict, List, Optional
     from datetime import datetime
 
 try:
@@ -84,16 +85,17 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
     call_type = kwargs.get("call_type", None)
     if call_type == "embedding" or call_type == "aembedding":
         operation = "embeddings"
+        op = consts.OP.GEN_AI_EMBEDDINGS
+    elif call_type == "responses" or call_type == "aresponses":
+        operation = "responses"
+        op = consts.OP.GEN_AI_RESPONSES
     else:
         operation = "chat"
+        op = consts.OP.GEN_AI_CHAT
 
     # Start a new span/transaction
     span = get_start_span_function()(
-        op=(
-            consts.OP.GEN_AI_CHAT
-            if operation == "chat"
-            else consts.OP.GEN_AI_EMBEDDINGS
-        ),
+        op=op,
         name=f"{operation} {model}",
         origin=LiteLLMIntegration.origin,
     )
@@ -106,14 +108,15 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
     set_data_normalized(span, SPANDATA.GEN_AI_SYSTEM, provider)
     set_data_normalized(span, SPANDATA.GEN_AI_OPERATION_NAME, operation)
 
-    # Record input/messages if allowed
-    if should_send_default_pii() and integration.include_prompts:
-        if operation == "embeddings":
-            # For embeddings, look for the 'input' parameter
+    # Per-operation request data. Conversation id (responses) is set
+    # unconditionally; user-content fields are gated on PII / include_prompts.
+    record_prompts = should_send_default_pii() and integration.include_prompts
+    scope = sentry_sdk.get_current_scope()
+
+    if operation == "embeddings":
+        if record_prompts:
             embedding_input = kwargs.get("input")
             if embedding_input:
-                scope = sentry_sdk.get_current_scope()
-                # Normalize to list format
                 input_list = (
                     embedding_input
                     if isinstance(embedding_input, list)
@@ -129,11 +132,50 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
                         messages_data,
                         unpack=False,
                     )
-        else:
-            # For chat, look for the 'messages' parameter
+
+    elif operation == "responses":
+        # litellm unpacks `extra_body` into the request body, so the
+        # `conversation` field shows up in additional_args.complete_input_dict
+        # rather than as a top-level kwarg.
+        complete_input = (kwargs.get("additional_args") or {}).get(
+            "complete_input_dict"
+        ) or {}
+        conversation = complete_input.get("conversation")
+        if conversation is not None:
+            conversation_id: "Optional[str]" = None
+            if isinstance(conversation, str):
+                conversation_id = conversation
+            elif isinstance(conversation, dict):
+                conversation_id = conversation.get("id")
+            if conversation_id is not None:
+                set_data_normalized(
+                    span, SPANDATA.GEN_AI_CONVERSATION_ID, conversation_id
+                )
+
+        if record_prompts:
+            # `input` is either a string or a list of message dicts (same
+            # shape as OpenAI Responses API).
+            responses_input = kwargs.get("input")
+            if responses_input:
+                if isinstance(responses_input, str):
+                    input_messages = [responses_input]
+                else:
+                    input_messages = list(responses_input)
+                normalized = normalize_message_roles(input_messages)  # type: ignore[arg-type]
+                messages_data = truncate_and_annotate_messages(normalized, span, scope)
+                if messages_data is not None:
+                    set_data_normalized(
+                        span,
+                        SPANDATA.GEN_AI_REQUEST_MESSAGES,
+                        messages_data,
+                        unpack=False,
+                    )
+
+    else:
+        # Chat completions.
+        if record_prompts:
             messages = kwargs.get("messages", [])
             if messages:
-                scope = sentry_sdk.get_current_scope()
                 messages = _convert_message_parts(messages)
                 messages_data = truncate_and_annotate_messages(messages, span, scope)
                 if messages_data is not None:
@@ -166,11 +208,24 @@ async def _async_input_callback(kwargs: "Dict[str, Any]") -> None:
 
 def _success_callback(
     kwargs: "Dict[str, Any]",
-    completion_response: "Any",
+    response: "Any",
     start_time: "datetime",
     end_time: "datetime",
 ) -> None:
-    """Handle successful completion."""
+    """Handle a successful chat completion, embeddings, or Responses API call.
+
+    The shape of `response` differs between API paths:
+      - Chat Completions: ModelResponse with ``.choices[].message`` and
+        ``.usage`` carrying ``prompt_tokens`` / ``completion_tokens``.
+      - Responses API (non-streaming): ResponsesAPIResponse with ``.output[]``
+        items (``message`` / ``function_call``) and ``.usage`` carrying
+        ``input_tokens`` / ``output_tokens``.
+      - Responses API (streaming): a ResponseCompletedEvent wrapper
+        ``{type: "response.completed", response: ResponsesAPIResponse}``,
+        which we unwrap below.
+      - Embeddings: CreateEmbeddingResponse with ``.usage`` only (no choices
+        or output).
+    """
 
     metadata = _get_metadata_dict(kwargs)
     span = metadata.get("_sentry_span")
@@ -181,18 +236,25 @@ def _success_callback(
     if integration is None:
         return
 
+    # Streaming Responses API: unwrap the ResponseCompletedEvent so the rest of
+    # the function sees the assembled ResponsesAPIResponse directly.
+    if getattr(response, "type", None) == "response.completed" and hasattr(
+        response, "response"
+    ):
+        response = response.response
+
     try:
-        # Record model information
-        if hasattr(completion_response, "model"):
-            set_data_normalized(
-                span, SPANDATA.GEN_AI_RESPONSE_MODEL, completion_response.model
-            )
+        # `model` is set by all API shapes (chat / responses / embeddings).
+        if hasattr(response, "model"):
+            set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, response.model)
 
-        # Record response content if allowed
+        # Response content: structure depends on the API shape. Embeddings have
+        # neither ``choices`` nor ``output``, so we just skip this block.
         if should_send_default_pii() and integration.include_prompts:
-            if hasattr(completion_response, "choices"):
+            if hasattr(response, "choices"):
+                # Chat Completions API.
                 response_messages = []
-                for choice in completion_response.choices:
+                for choice in response.choices:
                     if hasattr(choice, "message"):
                         if hasattr(choice.message, "model_dump"):
                             response_messages.append(choice.message.model_dump())
@@ -213,14 +275,53 @@ def _success_callback(
                     set_data_normalized(
                         span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages
                     )
+            elif hasattr(response, "output"):
+                # Responses API: split message text from function-call items.
+                output_text: "List[Any]" = []
+                tool_calls: "List[Any]" = []
+                for output in response.output:
+                    output_type = getattr(output, "type", None)
+                    if output_type == "function_call":
+                        if hasattr(output, "model_dump"):
+                            tool_calls.append(output.model_dump())
+                        elif hasattr(output, "dict"):
+                            tool_calls.append(output.dict())
+                    elif output_type == "message":
+                        for content_item in getattr(output, "content", []) or []:
+                            text = getattr(content_item, "text", None)
+                            if text is not None:
+                                output_text.append(text)
+                            elif hasattr(content_item, "model_dump"):
+                                output_text.append(content_item.model_dump())
+                            elif hasattr(content_item, "dict"):
+                                output_text.append(content_item.dict())
+
+                if tool_calls:
+                    set_data_normalized(
+                        span,
+                        SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS,
+                        tool_calls,
+                        unpack=False,
+                    )
+                if output_text:
+                    set_data_normalized(
+                        span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_text
+                    )
 
-        # Record token usage
-        if hasattr(completion_response, "usage"):
-            usage = completion_response.usage
+        # Token usage field names differ across APIs:
+        #   Chat Completions / Embeddings: prompt_tokens / completion_tokens
+        #   Responses API (non-streaming): input_tokens  / output_tokens
+        #   Responses API (streaming):     prompt_tokens / completion_tokens
+        #     (litellm normalizes to chat-completion names when assembling the
+        #      streaming response).
+        if hasattr(response, "usage"):
+            usage = response.usage
             record_token_usage(
                 span,
-                input_tokens=getattr(usage, "prompt_tokens", None),
-                output_tokens=getattr(usage, "completion_tokens", None),
+                input_tokens=getattr(usage, "prompt_tokens", None)
+                or getattr(usage, "input_tokens", None),
+                output_tokens=getattr(usage, "completion_tokens", None)
+                or getattr(usage, "output_tokens", None),
                 total_tokens=getattr(usage, "total_tokens", None),
             )
 
diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py
index 18f8cfaf6e..1137a88fe3 100644
--- a/tests/integrations/litellm/test_litellm.py
+++ b/tests/integrations/litellm/test_litellm.py
@@ -2135,3 +2135,217 @@ def test_convert_message_parts_image_url_missing_url():
     converted = _convert_message_parts(messages)
     # Should return item unchanged
     assert converted[0]["content"][0]["type"] == "image_url"
+
+
+class MockResponsesUsage:
+    def __init__(self, input_tokens=12, output_tokens=24, total_tokens=36):
+        self.input_tokens = input_tokens
+        self.output_tokens = output_tokens
+        self.total_tokens = total_tokens
+
+
+class MockResponsesContentItem:
+    def __init__(self, text):
+        self.type = "output_text"
+        self.text = text
+
+
+class MockResponsesOutputMessage:
+    def __init__(self, text):
+        self.type = "message"
+        self.role = "assistant"
+        self.content = [MockResponsesContentItem(text)]
+
+
+class MockResponsesResponse:
+    def __init__(
+        self,
+        model="gpt-4.1-nano",
+        output=None,
+        usage=None,
+    ):
+        self.id = "resp-test"
+        self.model = model
+        self.output = output or [MockResponsesOutputMessage("the model response")]
+        self.usage = usage or MockResponsesUsage()
+
+
+def _build_responses_kwargs(
+    *,
+    input_value="What is the capital of France?",
+    conversation=None,
+    model="openai/gpt-4.1-nano",
+    extra_body_conversation=None,
+):
+    """
+    Build the kwargs shape that litellm passes to input_callback for a
+    responses() call. `extra_body` is unpacked into
+    additional_args.complete_input_dict by litellm before callbacks fire.
+    """
+    complete_input_dict = {"model": model.split("/")[-1], "input": input_value}
+    if extra_body_conversation is not None:
+        complete_input_dict["conversation"] = extra_body_conversation
+    elif conversation is not None:
+        complete_input_dict["conversation"] = conversation
+
+    return {
+        "model": model,
+        "input": input_value,
+        "call_type": "responses",
+        "additional_args": {"complete_input_dict": complete_input_dict},
+    }
+
+
+@pytest.mark.parametrize(
+    "conversation, expected_id",
+    [
+        pytest.param(None, None, id="no_conversation"),
+        pytest.param({"id": "conv_abc123"}, "conv_abc123", id="dict"),
+        pytest.param("conv_str_id", "conv_str_id", id="string"),
+    ],
+)
+def test_responses_conversation_id(
+    sentry_init, capture_events, conversation, expected_id
+):
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs(extra_body_conversation=conversation)
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["description"] == "responses gpt-4.1-nano"
+    assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses"
+
+    if expected_id is None:
+        assert SPANDATA.GEN_AI_CONVERSATION_ID not in span["data"]
+    else:
+        assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == expected_id
+
+
+def test_responses_records_input_output_and_usage(sentry_init, capture_events):
+    sentry_init(
+        integrations=[LiteLLMIntegration(include_prompts=True)],
+        traces_sample_rate=1.0,
+        send_default_pii=True,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs(
+        extra_body_conversation={"id": "conv_xyz"},
+    )
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz"
+    assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"]
+    assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 12
+    assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 24
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36
+
+
+def test_responses_no_pii_omits_messages(sentry_init, capture_events):
+    sentry_init(
+        integrations=[LiteLLMIntegration(include_prompts=True)],
+        traces_sample_rate=1.0,
+        # send_default_pii not set -> defaults to False
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs(extra_body_conversation={"id": "conv_xyz"})
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    # Conversation id is not PII, but request/response content is
+    assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz"
+    assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"]
+    assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"]
+
+
+class MockResponseCompletedEvent:
+    """Mimics litellm/openai's response.completed streaming wrapper."""
+
+    def __init__(self, response):
+        self.type = "response.completed"
+        self.response = response
+
+
+def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events):
+    """For streaming responses, success_handler receives a ResponseCompletedEvent
+    wrapper. We must unwrap it to read usage/output from the inner response."""
+    sentry_init(
+        integrations=[LiteLLMIntegration(include_prompts=True)],
+        traces_sample_rate=1.0,
+        send_default_pii=True,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs()
+    kwargs["stream"] = True
+    kwargs["complete_streaming_response"] = MockResponsesResponse()
+
+    wrapper = MockResponseCompletedEvent(MockResponsesResponse())
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(kwargs, wrapper, datetime.now(), datetime.now())
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 12
+    assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 24
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36
+    assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
+
+
+def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events):
+    """aresponses (async) call_type should produce a responses span."""
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs()
+    kwargs["call_type"] = "aresponses"
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses"

From d39b073c8167484e64bae36ad2e00551bafd015d Mon Sep 17 00:00:00 2001
From: Fabian Schindler <fabian.schindler@sentry.io>
Date: Tue, 5 May 2026 14:45:04 +0200
Subject: [PATCH 2/3] fix: handle differing token usage shapes

---
 sentry_sdk/integrations/litellm.py         | 31 ++++++++++++---
 tests/integrations/litellm/test_litellm.py | 44 ++++++++++++++++++++++
 2 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py
index 111fa3de2d..5fee701ed9 100644
--- a/sentry_sdk/integrations/litellm.py
+++ b/sentry_sdk/integrations/litellm.py
@@ -40,6 +40,23 @@ def _get_metadata_dict(kwargs: "Dict[str, Any]") -> "Dict[str, Any]":
     return metadata
 
 
+def _read_usage_field(usage: "Any", *names: str) -> "Optional[int]":
+    """Read the first non-None field from a usage container.
+
+    The usage object can be either a typed Pydantic model (attribute access) or
+    a plain dict (litellm hands us a dict for the assembled async-streaming
+    response), so we try both shapes.
+    """
+    for name in names:
+        if isinstance(usage, dict):
+            value = usage.get(name)
+        else:
+            value = getattr(usage, name, None)
+        if value is not None:
+            return value
+    return None
+
+
 def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, Any]]":
     """
     Convert the message parts from OpenAI format to the `gen_ai.request.messages` format
@@ -313,16 +330,18 @@ def _success_callback(
         #   Responses API (non-streaming): input_tokens  / output_tokens
         #   Responses API (streaming):     prompt_tokens / completion_tokens
         #     (litellm normalizes to chat-completion names when assembling the
-        #      streaming response).
+        #      streaming response). For the async-streaming variant, the
+        #      assembled `usage` is a plain dict, not a Pydantic model — hence
+        #      `_read_usage_field` supports both shapes.
         if hasattr(response, "usage"):
             usage = response.usage
             record_token_usage(
                 span,
-                input_tokens=getattr(usage, "prompt_tokens", None)
-                or getattr(usage, "input_tokens", None),
-                output_tokens=getattr(usage, "completion_tokens", None)
-                or getattr(usage, "output_tokens", None),
-                total_tokens=getattr(usage, "total_tokens", None),
+                input_tokens=_read_usage_field(usage, "prompt_tokens", "input_tokens"),
+                output_tokens=_read_usage_field(
+                    usage, "completion_tokens", "output_tokens"
+                ),
+                total_tokens=_read_usage_field(usage, "total_tokens"),
             )
 
     finally:
diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py
index 1137a88fe3..5dcf16003f 100644
--- a/tests/integrations/litellm/test_litellm.py
+++ b/tests/integrations/litellm/test_litellm.py
@@ -2327,6 +2327,50 @@ def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events
     assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
 
 
+class MockResponseWithDictUsage:
+    """Mimics the assembled async-streaming responses object: usage is a dict
+    (not a Pydantic model), as litellm hands us for that path."""
+
+    def __init__(self):
+        self.id = "resp-test"
+        self.model = "gpt-4.1-nano"
+        self.output = [MockResponsesOutputMessage("hi")]
+        self.usage = {
+            "prompt_tokens": 7,
+            "completion_tokens": 2,
+            "total_tokens": 9,
+        }
+
+
+def test_responses_async_streaming_dict_usage(sentry_init, capture_events):
+    """For async streaming responses, litellm assembles `usage` as a plain dict.
+    `getattr(dict, ...)` would silently miss it; we need to support both shapes."""
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs()
+    kwargs["call_type"] = "aresponses"
+    kwargs["stream"] = True
+    kwargs["async_complete_streaming_response"] = MockResponseWithDictUsage()
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponseWithDictUsage(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 7
+    assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 2
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 9
+
+
 def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events):
     """aresponses (async) call_type should produce a responses span."""
     sentry_init(

From b3d837c704d4e292bc45b2289cc097c0f1b58f3a Mon Sep 17 00:00:00 2001
From: Fabian Schindler <fabian.schindler@sentry.io>
Date: Mon, 18 May 2026 13:31:11 +0200
Subject: [PATCH 3/3] feat(tests): enhance tests for Responses API integration
 and conversation ID handling

---
 sentry_sdk/integrations/litellm.py         | 284 +++++++++---------
 tests/integrations/litellm/test_litellm.py | 318 ++++++++++++---------
 2 files changed, 343 insertions(+), 259 deletions(-)

diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py
index 5fee701ed9..ed890ef6fd 100644
--- a/sentry_sdk/integrations/litellm.py
+++ b/sentry_sdk/integrations/litellm.py
@@ -18,12 +18,17 @@
 from sentry_sdk.utils import event_from_exception
 
 if TYPE_CHECKING:
-    from typing import Any, Dict, List, Optional
+    from typing import Any, Dict, List
     from datetime import datetime
 
 try:
     import litellm  # type: ignore[import-not-found]
     from litellm import input_callback, success_callback, failure_callback
+    from litellm.types.llms.openai import (  # type: ignore[import-not-found]
+        ResponseAPIUsage,
+        ResponseCompletedEvent,
+        ResponsesAPIResponse,
+    )
 except ImportError:
     raise DidNotEnable("LiteLLM not installed")
 
@@ -40,23 +45,6 @@ def _get_metadata_dict(kwargs: "Dict[str, Any]") -> "Dict[str, Any]":
     return metadata
 
 
-def _read_usage_field(usage: "Any", *names: str) -> "Optional[int]":
-    """Read the first non-None field from a usage container.
-
-    The usage object can be either a typed Pydantic model (attribute access) or
-    a plain dict (litellm hands us a dict for the assembled async-streaming
-    response), so we try both shapes.
-    """
-    for name in names:
-        if isinstance(usage, dict):
-            value = usage.get(name)
-        else:
-            value = getattr(usage, name, None)
-        if value is not None:
-            return value
-    return None
-
-
 def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, Any]]":
     """
     Convert the message parts from OpenAI format to the `gen_ai.request.messages` format
@@ -84,6 +72,48 @@ def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str,
     return messages
 
 
+def _record_responses_conversation_id(
+    span: "Any", complete_input: "Dict[str, Any]"
+) -> None:
+    """Set the conversation id on the span when the Responses API request carries one."""
+    conversation = complete_input.get("conversation")
+    if conversation is None:
+        return
+
+    if isinstance(conversation, str):
+        conversation_id = conversation
+    elif isinstance(conversation, dict):
+        conversation_id = conversation.get("id")
+    else:
+        conversation_id = None
+
+    if conversation_id is not None:
+        set_data_normalized(span, SPANDATA.GEN_AI_CONVERSATION_ID, conversation_id)
+
+
+def _record_responses_input_messages(
+    span: "Any", scope: "Any", responses_input: "Any"
+) -> None:
+    """Record Responses API `input` (a string or list of message dicts) on the span."""
+    if not responses_input:
+        return
+
+    if isinstance(responses_input, str):
+        input_messages = [responses_input]
+    else:
+        input_messages = list(responses_input)
+    normalized = normalize_message_roles(input_messages)  # type: ignore[arg-type]
+    messages_data = truncate_and_annotate_messages(normalized, span, scope)
+    if messages_data is not None:
+        # Use the same helper as the other span fields; `unpack` is its keyword.
+        set_data_normalized(
+            span,
+            SPANDATA.GEN_AI_REQUEST_MESSAGES,
+            messages_data,
+            unpack=False,
+        )
+
+
 def _input_callback(kwargs: "Dict[str, Any]") -> None:
     """Handle the start of a request."""
     integration = sentry_sdk.get_client().get_integration(LiteLLMIntegration)
@@ -157,36 +187,9 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
         complete_input = (kwargs.get("additional_args") or {}).get(
             "complete_input_dict"
         ) or {}
-        conversation = complete_input.get("conversation")
-        if conversation is not None:
-            conversation_id: "Optional[str]" = None
-            if isinstance(conversation, str):
-                conversation_id = conversation
-            elif isinstance(conversation, dict):
-                conversation_id = conversation.get("id")
-            if conversation_id is not None:
-                set_data_normalized(
-                    span, SPANDATA.GEN_AI_CONVERSATION_ID, conversation_id
-                )
-
+        _record_responses_conversation_id(span, complete_input)
         if record_prompts:
-            # `input` is either a string or a list of message dicts (same
-            # shape as OpenAI Responses API).
-            responses_input = kwargs.get("input")
-            if responses_input:
-                if isinstance(responses_input, str):
-                    input_messages = [responses_input]
-                else:
-                    input_messages = list(responses_input)
-                normalized = normalize_message_roles(input_messages)  # type: ignore[arg-type]
-                messages_data = truncate_and_annotate_messages(normalized, span, scope)
-                if messages_data is not None:
-                    set_data_normalized(
-                        span,
-                        SPANDATA.GEN_AI_REQUEST_MESSAGES,
-                        messages_data,
-                        unpack=False,
-                    )
+            _record_responses_input_messages(span, scope, kwargs.get("input"))
 
     else:
         # Chat completions.
@@ -223,6 +226,103 @@ async def _async_input_callback(kwargs: "Dict[str, Any]") -> None:
     return _input_callback(kwargs)
 
 
+def _record_chat_response_messages(span: "Any", response: "Any") -> None:
+    """Record response.text from a Chat Completions response."""
+    response_messages = []
+    for choice in response.choices:
+        message = getattr(choice, "message", None)
+        if message is None:
+            continue
+        if hasattr(message, "model_dump"):
+            response_messages.append(message.model_dump())
+        elif hasattr(message, "dict"):
+            response_messages.append(message.dict())
+        else:
+            # Fallback for basic message objects
+            msg = {}
+            if hasattr(message, "role"):
+                msg["role"] = message.role
+            if hasattr(message, "content"):
+                msg["content"] = message.content
+            if hasattr(message, "tool_calls"):
+                msg["tool_calls"] = message.tool_calls
+            response_messages.append(msg)
+
+    if response_messages:
+        set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages)
+
+
+def _record_responses_output(span: "Any", response: "ResponsesAPIResponse") -> None:
+    """Record response text and tool calls from a Responses API response."""
+    output_text = []  # type: List[Any]
+    tool_calls = []  # type: List[Any]
+    for output in response.output:
+        output_type = getattr(output, "type", None)
+        if output_type == "function_call":
+            if hasattr(output, "model_dump"):
+                tool_calls.append(output.model_dump())
+            elif hasattr(output, "dict"):
+                tool_calls.append(output.dict())
+        elif output_type == "message":
+            for content_item in getattr(output, "content", []) or []:
+                text = getattr(content_item, "text", None)
+                if text is not None:
+                    output_text.append(text)
+                elif hasattr(content_item, "model_dump"):
+                    output_text.append(content_item.model_dump())
+                elif hasattr(content_item, "dict"):
+                    output_text.append(content_item.dict())
+
+    if tool_calls:
+        set_data_normalized(
+            span,
+            SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS,
+            tool_calls,
+            unpack=False,
+        )
+    if output_text:
+        set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_text)
+
+
+def _record_token_usage_from_response(span: "Any", response: "Any") -> None:
+    """Record token usage. The shape of ``usage`` depends on the litellm
+    processing pipeline rather than the API path:
+
+    - ``ResponseAPIUsage``: raw Responses API usage (``input_tokens`` /
+      ``output_tokens``). Seen when litellm has not yet normalized the value.
+    - ``dict``: chat-style dict (``prompt_tokens`` / ``completion_tokens``).
+      litellm assembles streaming Responses API usage as a dict.
+    - Otherwise: chat-style Pydantic ``Usage`` (``prompt_tokens`` /
+      ``completion_tokens``). Used for Chat Completions, Embeddings, and
+      non-streaming Responses API after litellm's post-processing.
+    """
+    usage = getattr(response, "usage", None)
+    if usage is None:
+        return
+
+    if isinstance(usage, ResponseAPIUsage):
+        record_token_usage(
+            span,
+            input_tokens=usage.input_tokens,
+            output_tokens=usage.output_tokens,
+            total_tokens=usage.total_tokens,
+        )
+    elif isinstance(usage, dict):
+        record_token_usage(
+            span,
+            input_tokens=usage.get("prompt_tokens"),
+            output_tokens=usage.get("completion_tokens"),
+            total_tokens=usage.get("total_tokens"),
+        )
+    else:
+        record_token_usage(
+            span,
+            input_tokens=getattr(usage, "prompt_tokens", None),
+            output_tokens=getattr(usage, "completion_tokens", None),
+            total_tokens=getattr(usage, "total_tokens", None),
+        )
+
+
 def _success_callback(
     kwargs: "Dict[str, Any]",
     response: "Any",
@@ -237,9 +337,8 @@ def _success_callback(
       - Responses API (non-streaming): ResponsesAPIResponse with ``.output[]``
         items (``message`` / ``function_call``) and ``.usage`` carrying
         ``input_tokens`` / ``output_tokens``.
-      - Responses API (streaming): a ResponseCompletedEvent wrapper
-        ``{type: "response.completed", response: ResponsesAPIResponse}``,
-        which we unwrap below.
+      - Responses API (streaming): a ResponseCompletedEvent wrapping a
+        ``ResponsesAPIResponse``, which we unwrap below.
       - Embeddings: CreateEmbeddingResponse with ``.usage`` only (no choices
         or output).
     """
@@ -255,9 +354,7 @@ def _success_callback(
 
     # Streaming Responses API: unwrap the ResponseCompletedEvent so the rest of
     # the function sees the assembled ResponsesAPIResponse directly.
-    if getattr(response, "type", None) == "response.completed" and hasattr(
-        response, "response"
-    ):
+    if isinstance(response, ResponseCompletedEvent):
         response = response.response
 
     try:
@@ -265,84 +362,13 @@ def _success_callback(
         if hasattr(response, "model"):
             set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, response.model)
 
-        # Response content: structure depends on the API shape. Embeddings have
-        # neither ``choices`` nor ``output``, so we just skip this block.
         if should_send_default_pii() and integration.include_prompts:
-            if hasattr(response, "choices"):
-                # Chat Completions API.
-                response_messages = []
-                for choice in response.choices:
-                    if hasattr(choice, "message"):
-                        if hasattr(choice.message, "model_dump"):
-                            response_messages.append(choice.message.model_dump())
-                        elif hasattr(choice.message, "dict"):
-                            response_messages.append(choice.message.dict())
-                        else:
-                            # Fallback for basic message objects
-                            msg = {}
-                            if hasattr(choice.message, "role"):
-                                msg["role"] = choice.message.role
-                            if hasattr(choice.message, "content"):
-                                msg["content"] = choice.message.content
-                            if hasattr(choice.message, "tool_calls"):
-                                msg["tool_calls"] = choice.message.tool_calls
-                            response_messages.append(msg)
-
-                if response_messages:
-                    set_data_normalized(
-                        span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages
-                    )
-            elif hasattr(response, "output"):
-                # Responses API: split message text from function-call items.
-                output_text: "List[Any]" = []
-                tool_calls: "List[Any]" = []
-                for output in response.output:
-                    output_type = getattr(output, "type", None)
-                    if output_type == "function_call":
-                        if hasattr(output, "model_dump"):
-                            tool_calls.append(output.model_dump())
-                        elif hasattr(output, "dict"):
-                            tool_calls.append(output.dict())
-                    elif output_type == "message":
-                        for content_item in getattr(output, "content", []) or []:
-                            text = getattr(content_item, "text", None)
-                            if text is not None:
-                                output_text.append(text)
-                            elif hasattr(content_item, "model_dump"):
-                                output_text.append(content_item.model_dump())
-                            elif hasattr(content_item, "dict"):
-                                output_text.append(content_item.dict())
-
-                if tool_calls:
-                    set_data_normalized(
-                        span,
-                        SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS,
-                        tool_calls,
-                        unpack=False,
-                    )
-                if output_text:
-                    set_data_normalized(
-                        span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_text
-                    )
+            if isinstance(response, ResponsesAPIResponse):
+                _record_responses_output(span, response)
+            elif hasattr(response, "choices"):
+                _record_chat_response_messages(span, response)
 
-        # Token usage field names differ across APIs:
-        #   Chat Completions / Embeddings: prompt_tokens / completion_tokens
-        #   Responses API (non-streaming): input_tokens  / output_tokens
-        #   Responses API (streaming):     prompt_tokens / completion_tokens
-        #     (litellm normalizes to chat-completion names when assembling the
-        #      streaming response). For the async-streaming variant, the
-        #      assembled `usage` is a plain dict, not a Pydantic model — hence
-        #      `_read_usage_field` supports both shapes.
-        if hasattr(response, "usage"):
-            usage = response.usage
-            record_token_usage(
-                span,
-                input_tokens=_read_usage_field(usage, "prompt_tokens", "input_tokens"),
-                output_tokens=_read_usage_field(
-                    usage, "completion_tokens", "output_tokens"
-                ),
-                total_tokens=_read_usage_field(usage, "total_tokens"),
-            )
+        _record_token_usage_from_response(span, response)
 
     finally:
         is_streaming = kwargs.get("stream")
diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py
index 5dcf16003f..18f18cbfeb 100644
--- a/tests/integrations/litellm/test_litellm.py
+++ b/tests/integrations/litellm/test_litellm.py
@@ -2137,63 +2137,52 @@ def test_convert_message_parts_image_url_missing_url():
     assert converted[0]["content"][0]["type"] == "image_url"
 
 
-class MockResponsesUsage:
-    def __init__(self, input_tokens=12, output_tokens=24, total_tokens=36):
-        self.input_tokens = input_tokens
-        self.output_tokens = output_tokens
-        self.total_tokens = total_tokens
-
-
-class MockResponsesContentItem:
-    def __init__(self, text):
-        self.type = "output_text"
-        self.text = text
-
-
-class MockResponsesOutputMessage:
-    def __init__(self, text):
-        self.type = "message"
-        self.role = "assistant"
-        self.content = [MockResponsesContentItem(text)]
-
-
-class MockResponsesResponse:
-    def __init__(
-        self,
-        model="gpt-4.1-nano",
-        output=None,
-        usage=None,
-    ):
-        self.id = "resp-test"
-        self.model = model
-        self.output = output or [MockResponsesOutputMessage("the model response")]
-        self.usage = usage or MockResponsesUsage()
-
-
-def _build_responses_kwargs(
-    *,
-    input_value="What is the capital of France?",
-    conversation=None,
-    model="openai/gpt-4.1-nano",
-    extra_body_conversation=None,
+def _make_responses_api_response(
+    model="gpt-4.1-nano",
+    text="the model response",
+    input_tokens=12,
+    output_tokens=24,
+    total_tokens=36,
 ):
-    """
-    Build the kwargs shape that litellm passes to input_callback for a
-    responses() call. `extra_body` is unpacked into
-    additional_args.complete_input_dict by litellm before callbacks fire.
-    """
-    complete_input_dict = {"model": model.split("/")[-1], "input": input_value}
-    if extra_body_conversation is not None:
-        complete_input_dict["conversation"] = extra_body_conversation
-    elif conversation is not None:
-        complete_input_dict["conversation"] = conversation
-
-    return {
-        "model": model,
-        "input": input_value,
-        "call_type": "responses",
-        "additional_args": {"complete_input_dict": complete_input_dict},
-    }
+    """Build a real openai.types.responses.Response for use as a fake HTTP
+    payload. litellm parses the JSON into a litellm ResponsesAPIResponse."""
+    import openai.types.responses as resp_types
+
+    return resp_types.Response(
+        id="resp-test",
+        output=[
+            resp_types.ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                status="completed",
+                content=[
+                    resp_types.ResponseOutputText(
+                        text=text,
+                        type="output_text",
+                        annotations=[],
+                    )
+                ],
+                role="assistant",
+            )
+        ],
+        parallel_tool_calls=False,
+        tool_choice="none",
+        tools=[],
+        created_at=10000000,
+        model=model,
+        object="response",
+        usage=resp_types.ResponseUsage(
+            input_tokens=input_tokens,
+            input_tokens_details=resp_types.response_usage.InputTokensDetails(
+                cached_tokens=0,
+            ),
+            output_tokens=output_tokens,
+            output_tokens_details=resp_types.response_usage.OutputTokensDetails(
+                reasoning_tokens=0,
+            ),
+            total_tokens=total_tokens,
+        ),
+    )
 
 
 @pytest.mark.parametrize(
@@ -2205,7 +2194,12 @@ def _build_responses_kwargs(
     ],
 )
 def test_responses_conversation_id(
-    sentry_init, capture_events, conversation, expected_id
+    reset_litellm_executor,
+    sentry_init,
+    capture_events,
+    get_model_response,
+    conversation,
+    expected_id,
 ):
     sentry_init(
         integrations=[LiteLLMIntegration()],
@@ -2213,18 +2207,30 @@ def test_responses_conversation_id(
     )
     events = capture_events()
 
-    kwargs = _build_responses_kwargs(extra_body_conversation=conversation)
+    client = HTTPHandler()
+    fake_response = get_model_response(
+        _make_responses_api_response(),
+        serialize_pydantic=True,
+    )
 
-    with start_transaction(name="litellm test"):
-        _input_callback(kwargs)
-        _success_callback(
-            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
-        )
+    extra_body = {"conversation": conversation} if conversation is not None else {}
+
+    with mock.patch.object(client, "post", return_value=fake_response):
+        with start_transaction(name="litellm test"):
+            litellm.responses(
+                model="openai/gpt-4.1-nano",
+                input="What is the capital of France?",
+                client=client,
+                extra_body=extra_body,
+            )
+            litellm_utils.executor.shutdown(wait=True)
 
     (event,) = events
-    (span,) = event["spans"]
+    response_spans = [
+        span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES
+    ]
+    (span,) = response_spans
 
-    assert span["op"] == OP.GEN_AI_RESPONSES
     assert span["description"] == "responses gpt-4.1-nano"
     assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses"
 
@@ -2234,7 +2240,12 @@ def test_responses_conversation_id(
         assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == expected_id
 
 
-def test_responses_records_input_output_and_usage(sentry_init, capture_events):
+def test_responses_records_input_output_and_usage(
+    reset_litellm_executor,
+    sentry_init,
+    capture_events,
+    get_model_response,
+):
     sentry_init(
         integrations=[LiteLLMIntegration(include_prompts=True)],
         traces_sample_rate=1.0,
@@ -2242,20 +2253,28 @@ def test_responses_records_input_output_and_usage(sentry_init, capture_events):
     )
     events = capture_events()
 
-    kwargs = _build_responses_kwargs(
-        extra_body_conversation={"id": "conv_xyz"},
+    client = HTTPHandler()
+    fake_response = get_model_response(
+        _make_responses_api_response(text="the model response"),
+        serialize_pydantic=True,
     )
 
-    with start_transaction(name="litellm test"):
-        _input_callback(kwargs)
-        _success_callback(
-            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
-        )
+    with mock.patch.object(client, "post", return_value=fake_response):
+        with start_transaction(name="litellm test"):
+            litellm.responses(
+                model="openai/gpt-4.1-nano",
+                input="What is the capital of France?",
+                client=client,
+                extra_body={"conversation": {"id": "conv_xyz"}},
+            )
+            litellm_utils.executor.shutdown(wait=True)
 
     (event,) = events
-    (span,) = event["spans"]
+    response_spans = [
+        span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES
+    ]
+    (span,) = response_spans
 
-    assert span["op"] == OP.GEN_AI_RESPONSES
     assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz"
     assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"]
     assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
@@ -2264,7 +2283,12 @@ def test_responses_records_input_output_and_usage(sentry_init, capture_events):
     assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36
 
 
-def test_responses_no_pii_omits_messages(sentry_init, capture_events):
+def test_responses_no_pii_omits_messages(
+    reset_litellm_executor,
+    sentry_init,
+    capture_events,
+    get_model_response,
+):
     sentry_init(
         integrations=[LiteLLMIntegration(include_prompts=True)],
         traces_sample_rate=1.0,
@@ -2272,16 +2296,27 @@ def test_responses_no_pii_omits_messages(sentry_init, capture_events):
     )
     events = capture_events()
 
-    kwargs = _build_responses_kwargs(extra_body_conversation={"id": "conv_xyz"})
+    client = HTTPHandler()
+    fake_response = get_model_response(
+        _make_responses_api_response(),
+        serialize_pydantic=True,
+    )
 
-    with start_transaction(name="litellm test"):
-        _input_callback(kwargs)
-        _success_callback(
-            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
-        )
+    with mock.patch.object(client, "post", return_value=fake_response):
+        with start_transaction(name="litellm test"):
+            litellm.responses(
+                model="openai/gpt-4.1-nano",
+                input="What is the capital of France?",
+                client=client,
+                extra_body={"conversation": {"id": "conv_xyz"}},
+            )
+            litellm_utils.executor.shutdown(wait=True)
 
     (event,) = events
-    (span,) = event["spans"]
+    response_spans = [
+        span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES
+    ]
+    (span,) = response_spans
 
     # Conversation id is not PII, but request/response content is
     assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz"
@@ -2289,17 +2324,12 @@ def test_responses_no_pii_omits_messages(sentry_init, capture_events):
     assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"]
 
 
-class MockResponseCompletedEvent:
-    """Mimics litellm/openai's response.completed streaming wrapper."""
-
-    def __init__(self, response):
-        self.type = "response.completed"
-        self.response = response
-
-
 def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events):
     """For streaming responses, success_handler receives a ResponseCompletedEvent
-    wrapper. We must unwrap it to read usage/output from the inner response."""
+    wrapping the assembled ResponsesAPIResponse. We must unwrap it to read
+    usage/output from the inner response."""
+    from litellm.types.llms.openai import ResponsesAPIResponse, ResponseCompletedEvent
+
     sentry_init(
         integrations=[LiteLLMIntegration(include_prompts=True)],
         traces_sample_rate=1.0,
@@ -2307,11 +2337,19 @@ def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events
     )
     events = capture_events()
 
-    kwargs = _build_responses_kwargs()
-    kwargs["stream"] = True
-    kwargs["complete_streaming_response"] = MockResponsesResponse()
+    inner_response = ResponsesAPIResponse(
+        **_make_responses_api_response().model_dump(by_alias=True, exclude_none=True),
+    )
+    wrapper = ResponseCompletedEvent(type="response.completed", response=inner_response)
 
-    wrapper = MockResponseCompletedEvent(MockResponsesResponse())
+    kwargs = {
+        "model": "openai/gpt-4.1-nano",
+        "input": "What is the capital of France?",
+        "call_type": "responses",
+        "stream": True,
+        "complete_streaming_response": inner_response,
+        "additional_args": {"complete_input_dict": {}},
+    }
 
     with start_transaction(name="litellm test"):
         _input_callback(kwargs)
@@ -2327,40 +2365,43 @@ def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events
     assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
 
 
-class MockResponseWithDictUsage:
-    """Mimics the assembled async-streaming responses object: usage is a dict
-    (not a Pydantic model), as litellm hands us for that path."""
-
-    def __init__(self):
-        self.id = "resp-test"
-        self.model = "gpt-4.1-nano"
-        self.output = [MockResponsesOutputMessage("hi")]
-        self.usage = {
-            "prompt_tokens": 7,
-            "completion_tokens": 2,
-            "total_tokens": 9,
-        }
-
-
 def test_responses_async_streaming_dict_usage(sentry_init, capture_events):
-    """For async streaming responses, litellm assembles `usage` as a plain dict.
-    `getattr(dict, ...)` would silently miss it; we need to support both shapes."""
+    """For async streaming responses, litellm transforms `usage` into a chat-style
+    dict on the assembled ResponsesAPIResponse (see
+    Logging._get_assembled_streaming_response). We must read the chat-style keys
+    when usage is a dict."""
+    from litellm.types.llms.openai import ResponsesAPIResponse
+
     sentry_init(
         integrations=[LiteLLMIntegration()],
         traces_sample_rate=1.0,
     )
     events = capture_events()
 
-    kwargs = _build_responses_kwargs()
-    kwargs["call_type"] = "aresponses"
-    kwargs["stream"] = True
-    kwargs["async_complete_streaming_response"] = MockResponseWithDictUsage()
+    response_data = _make_responses_api_response().model_dump(
+        by_alias=True, exclude_none=True
+    )
+    response = ResponsesAPIResponse(**response_data)
+    # litellm replaces ResponseAPIUsage with a chat-style dict during streaming
+    # assembly; mirror that mutation here.
+    response.usage = {
+        "prompt_tokens": 7,
+        "completion_tokens": 2,
+        "total_tokens": 9,
+    }
+
+    kwargs = {
+        "model": "openai/gpt-4.1-nano",
+        "input": "What is the capital of France?",
+        "call_type": "aresponses",
+        "stream": True,
+        "async_complete_streaming_response": response,
+        "additional_args": {"complete_input_dict": {}},
+    }
 
     with start_transaction(name="litellm test"):
         _input_callback(kwargs)
-        _success_callback(
-            kwargs, MockResponseWithDictUsage(), datetime.now(), datetime.now()
-        )
+        _success_callback(kwargs, response, datetime.now(), datetime.now())
 
     (event,) = events
     (span,) = event["spans"]
@@ -2371,7 +2412,12 @@ def test_responses_async_streaming_dict_usage(sentry_init, capture_events):
     assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 9
 
 
-def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events):
+@pytest.mark.asyncio(loop_scope="session")
+async def test_aresponses_call_type_treated_as_responses(
+    sentry_init,
+    capture_events,
+    get_model_response,
+):
     """aresponses (async) call_type should produce a responses span."""
     sentry_init(
         integrations=[LiteLLMIntegration()],
@@ -2379,17 +2425,29 @@ def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events):
     )
     events = capture_events()
 
-    kwargs = _build_responses_kwargs()
-    kwargs["call_type"] = "aresponses"
+    client = AsyncHTTPHandler()
+    fake_response = get_model_response(
+        _make_responses_api_response(),
+        serialize_pydantic=True,
+    )
+
+    async def fake_post(*args, **kwargs):
+        return fake_response
 
-    with start_transaction(name="litellm test"):
-        _input_callback(kwargs)
-        _success_callback(
-            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
-        )
+    with mock.patch.object(client, "post", new=fake_post):
+        with start_transaction(name="litellm test"):
+            await litellm.aresponses(
+                model="openai/gpt-4.1-nano",
+                input="What is the capital of France?",
+                client=client,
+            )
+            await GLOBAL_LOGGING_WORKER.flush()
+            await asyncio.sleep(0.5)
 
     (event,) = events
-    (span,) = event["spans"]
+    response_spans = [
+        span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES
+    ]
+    (span,) = response_spans
 
-    assert span["op"] == OP.GEN_AI_RESPONSES
     assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses"