Skip to content

Commit d1c1927

Browse files
committed
fix(openai): preserve logprobs for streaming chat spans
Accumulate per-chunk logprob content and refusal entries when postprocessing chat completion streams so traced output matches non-streaming calls. Also read finish_reason from the streamed choice and add a VCR regression test for the wrapped sync path. Closes #180
1 parent 43faa35 commit d1c1927

File tree

3 files changed

+189
-5
lines changed

3 files changed

+189
-5
lines changed
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
interactions:
2+
- request:
3+
body: '{"messages":[{"role":"system","content":"Reply with exactly OK and nothing
4+
else."},{"role":"user","content":"Reply with exactly: OK"}],"model":"gpt-4o-mini","logprobs":true,"seed":1,"stream":true,"temperature":0,"top_logprobs":2}'
5+
headers:
6+
Accept:
7+
- application/json
8+
Accept-Encoding:
9+
- gzip, deflate
10+
Connection:
11+
- keep-alive
12+
Content-Length:
13+
- '229'
14+
Content-Type:
15+
- application/json
16+
Host:
17+
- api.openai.com
18+
User-Agent:
19+
- OpenAI/Python 2.30.0
20+
X-Stainless-Arch:
21+
- arm64
22+
X-Stainless-Async:
23+
- 'false'
24+
X-Stainless-Lang:
25+
- python
26+
X-Stainless-OS:
27+
- MacOS
28+
X-Stainless-Package-Version:
29+
- 2.30.0
30+
X-Stainless-Runtime:
31+
- CPython
32+
X-Stainless-Runtime-Version:
33+
- 3.13.3
34+
x-stainless-read-timeout:
35+
- '600'
36+
x-stainless-retry-count:
37+
- '0'
38+
method: POST
39+
uri: https://api.openai.com/v1/chat/completions
40+
response:
41+
body:
42+
string: 'data: {"id":"chatcmpl-DS469BIj7I8lPBf6o1Vrh09KdRS1C","object":"chat.completion.chunk","created":1775581309,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_218cd55417","choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":{"content":[],"refusal":null},"finish_reason":null}],"obfuscation":"4JaDqpR1wg0mz"}
43+
44+
45+
data: {"id":"chatcmpl-DS469BIj7I8lPBf6o1Vrh09KdRS1C","object":"chat.completion.chunk","created":1775581309,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_218cd55417","choices":[{"index":0,"delta":{"content":"OK"},"logprobs":{"content":[{"token":"OK","logprob":0.0,"bytes":[79,75],"top_logprobs":[{"token":"OK","logprob":0.0,"bytes":[79,75]},{"token":"
46+
OK","logprob":-19.0,"bytes":[32,79,75]}]}],"refusal":null},"finish_reason":null}],"obfuscation":""}
47+
48+
49+
data: {"id":"chatcmpl-DS469BIj7I8lPBf6o1Vrh09KdRS1C","object":"chat.completion.chunk","created":1775581309,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_218cd55417","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}],"obfuscation":"Ox"}
50+
51+
52+
data: [DONE]
53+
54+
55+
'
56+
headers:
57+
CF-Cache-Status:
58+
- DYNAMIC
59+
CF-Ray:
60+
- 9e8a98af68f1abb5-YYZ
61+
Connection:
62+
- keep-alive
63+
Content-Type:
64+
- text/event-stream; charset=utf-8
65+
Date:
66+
- Tue, 07 Apr 2026 17:01:49 GMT
67+
Server:
68+
- cloudflare
69+
Strict-Transport-Security:
70+
- max-age=31536000; includeSubDomains; preload
71+
Transfer-Encoding:
72+
- chunked
73+
X-Content-Type-Options:
74+
- nosniff
75+
access-control-expose-headers:
76+
- X-Request-ID
77+
alt-svc:
78+
- h3=":443"; ma=86400
79+
openai-organization:
80+
- braintrust-data
81+
openai-processing-ms:
82+
- '257'
83+
openai-project:
84+
- proj_vsCSXafhhByzWOThMrJcZiw9
85+
openai-version:
86+
- '2020-10-01'
87+
set-cookie:
88+
- __cf_bm=DQLjVZZ0JcnAvssCENysUlUUTA0hqjJMaJa7wzYHKE0-1775581309.3517559-1.0.1.1-EK2uxO2w8TTFRLvMzUrQQtAGpZR5morImfrMd8X9zUWXT3pumefyADMQdomdOYlUAkhXhNZfV4uO5fL7ZPdGs9rTIf.aNPpodlWzVW2uYc_NGpz4247vvIUafMMkggqg;
89+
HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Tue, 07 Apr 2026
90+
17:31:49 GMT
91+
x-openai-proxy-wasm:
92+
- v0.1
93+
x-ratelimit-limit-requests:
94+
- '30000'
95+
x-ratelimit-limit-tokens:
96+
- '150000000'
97+
x-ratelimit-remaining-requests:
98+
- '29999'
99+
x-ratelimit-remaining-tokens:
100+
- '149999982'
101+
x-ratelimit-reset-requests:
102+
- 2ms
103+
x-ratelimit-reset-tokens:
104+
- 0s
105+
x-request-id:
106+
- req_5ae9848773d14dc8801267da556e753f
107+
status:
108+
code: 200
109+
message: OK
110+
version: 1

py/src/braintrust/integrations/openai/test_openai.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,51 @@ def test_openai_chat_streaming_sync(memory_logger):
333333
assert "24" in str(span["output"]) or "twenty-four" in str(span["output"]).lower()
334334

335335

336+
@pytest.mark.vcr
def test_openai_chat_streaming_sync_preserves_logprobs(memory_logger):
    """Regression test (closes #180): streaming chat completions must keep logprobs.

    Streams a deterministic one-token reply (temperature=0, fixed seed, recorded
    via VCR) with ``logprobs=True`` and checks that the traced span's output
    carries the accumulated per-chunk logprob entries and the finish_reason,
    matching what a non-streaming call would produce.
    """
    # Sanity: no spans left over from a previous test.
    assert not memory_logger.pop()

    client = wrap_openai(openai.OpenAI())
    stream = client.chat.completions.create(
        model=TEST_MODEL,
        messages=[
            {"role": "system", "content": "Reply with exactly OK and nothing else."},
            {"role": "user", "content": "Reply with exactly: OK"},
        ],
        stream=True,
        temperature=0,
        seed=1,
        logprobs=True,
        top_logprobs=2,
    )

    # Consume the stream ourselves, collecting what the raw chunks report so we
    # can later compare it against what the wrapper logged.
    chunk_logprob_tokens = []
    content = ""
    for chunk in stream:
        # Some chunks (e.g. usage-only) may carry no choices.
        if not chunk.choices:
            continue

        choice = chunk.choices[0]
        if choice.delta.content:
            content += choice.delta.content
        if choice.logprobs and choice.logprobs.content:
            chunk_logprob_tokens.extend(entry.token for entry in choice.logprobs.content)

    # The recorded cassette replies "OK" and includes logprobs for that token.
    assert "OK" in content
    assert "OK" in "".join(chunk_logprob_tokens)

    # Exactly one span should have been logged for the wrapped call.
    spans = memory_logger.pop()
    assert len(spans) == 1
    span = spans[0]
    assert span["output"]

    # The traced choice must preserve finish_reason and the accumulated
    # logprobs content (previously dropped for streaming responses).
    output_choice = span["output"][0]
    assert output_choice["finish_reason"] == "stop"
    assert output_choice["logprobs"] is not None
    assert output_choice["logprobs"]["content"]
    assert "OK" in "".join(entry["token"] for entry in output_choice["logprobs"]["content"])
379+
380+
336381
@pytest.mark.vcr
337382
def test_openai_chat_with_system_prompt(memory_logger):
338383
assert not memory_logger.pop()

py/src/braintrust/integrations/openai/tracing.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,9 @@ def _postprocess_streaming_results(cls, all_results: list[dict[str, Any]]) -> di
464464
content = None
465465
tool_calls: list[Any] | None = None
466466
finish_reason = None
467+
logprobs_content: list[Any] | None = None
468+
logprobs_refusal: list[Any] | None = None
469+
saw_logprobs = False
467470
metrics: dict[str, float] = {}
468471
for result in all_results:
469472
usage = result.get("usage")
@@ -473,16 +476,35 @@ def _postprocess_streaming_results(cls, all_results: list[dict[str, Any]]) -> di
473476
choices = result["choices"]
474477
if not choices:
475478
continue
476-
delta = choices[0]["delta"]
479+
480+
choice = choices[0]
481+
fr = choice.get("finish_reason")
482+
if fr is not None:
483+
finish_reason = fr
484+
485+
choice_logprobs = choice.get("logprobs")
486+
if choice_logprobs is not None:
487+
saw_logprobs = True
488+
489+
chunk_content_logprobs = choice_logprobs.get("content")
490+
if chunk_content_logprobs is not None:
491+
if logprobs_content is None:
492+
logprobs_content = []
493+
logprobs_content.extend(chunk_content_logprobs)
494+
495+
chunk_refusal_logprobs = choice_logprobs.get("refusal")
496+
if chunk_refusal_logprobs is not None:
497+
if logprobs_refusal is None:
498+
logprobs_refusal = []
499+
logprobs_refusal.extend(chunk_refusal_logprobs)
500+
501+
delta = choice.get("delta")
477502
if not delta:
478503
continue
479504

480505
if role is None and delta.get("role") is not None:
481506
role = delta.get("role")
482507

483-
if delta.get("finish_reason") is not None:
484-
finish_reason = delta.get("finish_reason")
485-
486508
if delta.get("content") is not None:
487509
content = (content or "") + delta.get("content")
488510

@@ -524,7 +546,14 @@ def _postprocess_streaming_results(cls, all_results: list[dict[str, Any]]) -> di
524546
"content": content,
525547
"tool_calls": tool_calls,
526548
},
527-
"logprobs": None,
549+
"logprobs": (
550+
{
551+
"content": logprobs_content,
552+
"refusal": logprobs_refusal,
553+
}
554+
if saw_logprobs
555+
else None
556+
),
528557
"finish_reason": finish_reason,
529558
}
530559
],

0 commit comments

Comments
 (0)