From c34d2af236944fb21fc655a3704ce542a21c8a17 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 7 Apr 2026 18:04:09 -0400 Subject: [PATCH] fix(claude_agent_sdk): use result message output Prefer ResultMessage.result as the root Claude Agent task output when the SDK provides it. The streamed assistant message can differ from the canonical final result, so the integration now records the final result value on the root span and falls back to the accumulated assistant content only when the result field is missing or None. Also tighten the existing cassette-backed calculator test to assert the root task output equals the result message's result value. --- .../claude_agent_sdk/test_claude_agent_sdk.py | 2 ++ .../integrations/claude_agent_sdk/tracing.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/py/src/braintrust/integrations/claude_agent_sdk/test_claude_agent_sdk.py b/py/src/braintrust/integrations/claude_agent_sdk/test_claude_agent_sdk.py index 6fe29edd..8e1d53b8 100644 --- a/py/src/braintrust/integrations/claude_agent_sdk/test_claude_agent_sdk.py +++ b/py/src/braintrust/integrations/claude_agent_sdk/test_claude_agent_sdk.py @@ -166,6 +166,8 @@ async def calculator_handler(args): assert task_span["output"] is not None assert result_message is not None, "Should have received result message" + assert getattr(result_message, "result", None) is not None + assert task_span["output"] == result_message.result if hasattr(result_message, "num_turns"): assert task_span.get("metadata", {}).get("num_turns") is not None if hasattr(result_message, "session_id"): diff --git a/py/src/braintrust/integrations/claude_agent_sdk/tracing.py b/py/src/braintrust/integrations/claude_agent_sdk/tracing.py index 4dea63d6..398d66f8 100644 --- a/py/src/braintrust/integrations/claude_agent_sdk/tracing.py +++ b/py/src/braintrust/integrations/claude_agent_sdk/tracing.py @@ -585,6 +585,7 @@ def __init__( self._task_order: list[str | None] = [] self._final_results: list[dict[str, 
Any]] = [] + self._result_output: Any | None = None self._task_events: list[dict[str, Any]] = [] _thread_local.tool_span_tracker = self._tool_tracker @@ -609,7 +610,11 @@ def add(self, message: Any) -> None: self._handle_system(message) def log_output(self) -> None: - """Log the last accumulated assistant message as the root span output.""" + """Log the canonical root span output for the request.""" + if self._result_output is not None: + self._root_span.log(output=self._result_output) + return + if self._final_results: self._root_span.log(output=self._final_results[-1]) @@ -715,6 +720,11 @@ def _handle_result(self, message: Any) -> None: ctx = self._get_context(None) if ctx.llm_span and (usage_metrics or usage_metadata): ctx.llm_span.log(metrics=usage_metrics or None, metadata=usage_metadata or None) + + result_value = getattr(message, "result", None) + if result_value is not None: + self._result_output = result_value + result_metadata = { k: v for k, v in {