From e2419e828ffb1298f3e8c349c21557accd573640 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 7 Apr 2026 13:19:16 -0400 Subject: [PATCH] fix(openai): preserve logprobs for streaming chat spans Accumulate per-chunk logprob content and refusal entries when postprocessing chat completion streams so traced output matches non-streaming calls. Also read finish_reason from the streamed choice and add a VCR regression test for the wrapped sync path. Closes #180 --- ...hat_streaming_sync_preserves_logprobs.yaml | 110 ++++++++++++++++++ .../integrations/openai/test_openai.py | 45 +++++++ .../braintrust/integrations/openai/tracing.py | 39 ++++++- 3 files changed, 189 insertions(+), 5 deletions(-) create mode 100644 py/src/braintrust/integrations/openai/cassettes/test_openai_chat_streaming_sync_preserves_logprobs.yaml diff --git a/py/src/braintrust/integrations/openai/cassettes/test_openai_chat_streaming_sync_preserves_logprobs.yaml b/py/src/braintrust/integrations/openai/cassettes/test_openai_chat_streaming_sync_preserves_logprobs.yaml new file mode 100644 index 00000000..d20e8752 --- /dev/null +++ b/py/src/braintrust/integrations/openai/cassettes/test_openai_chat_streaming_sync_preserves_logprobs.yaml @@ -0,0 +1,110 @@ +interactions: +- request: + body: '{"messages":[{"role":"system","content":"Reply with exactly OK and nothing + else."},{"role":"user","content":"Reply with exactly: OK"}],"model":"gpt-4o-mini","logprobs":true,"seed":1,"stream":true,"temperature":0,"top_logprobs":2}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '229' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - OpenAI/Python 2.30.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - 'false' + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.30.0 + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: 'data: {"id":"chatcmpl-DS469BIj7I8lPBf6o1Vrh09KdRS1C","object":"chat.completion.chunk","created":1775581309,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_218cd55417","choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":{"content":[],"refusal":null},"finish_reason":null}],"obfuscation":"4JaDqpR1wg0mz"} + + + data: {"id":"chatcmpl-DS469BIj7I8lPBf6o1Vrh09KdRS1C","object":"chat.completion.chunk","created":1775581309,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_218cd55417","choices":[{"index":0,"delta":{"content":"OK"},"logprobs":{"content":[{"token":"OK","logprob":0.0,"bytes":[79,75],"top_logprobs":[{"token":"OK","logprob":0.0,"bytes":[79,75]},{"token":" + OK","logprob":-19.0,"bytes":[32,79,75]}]}],"refusal":null},"finish_reason":null}],"obfuscation":""} + + + data: {"id":"chatcmpl-DS469BIj7I8lPBf6o1Vrh09KdRS1C","object":"chat.completion.chunk","created":1775581309,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_218cd55417","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}],"obfuscation":"Ox"} + + + data: [DONE] + + + ' + headers: + CF-Cache-Status: + - DYNAMIC + CF-Ray: + - 9e8a98af68f1abb5-YYZ + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Tue, 07 Apr 2026 17:01:49 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - braintrust-data + openai-processing-ms: + - '257' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=DQLjVZZ0JcnAvssCENysUlUUTA0hqjJMaJa7wzYHKE0-1775581309.3517559-1.0.1.1-EK2uxO2w8TTFRLvMzUrQQtAGpZR5morImfrMd8X9zUWXT3pumefyADMQdomdOYlUAkhXhNZfV4uO5fL7ZPdGs9rTIf.aNPpodlWzVW2uYc_NGpz4247vvIUafMMkggqg; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Tue, 07 Apr 2026 + 17:31:49 GMT + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999982' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_5ae9848773d14dc8801267da556e753f + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/openai/test_openai.py b/py/src/braintrust/integrations/openai/test_openai.py index 5f14335d..d8252000 100644 --- a/py/src/braintrust/integrations/openai/test_openai.py +++ b/py/src/braintrust/integrations/openai/test_openai.py @@ -383,6 +383,51 @@ def test_openai_chat_stream_helper_sync(memory_logger): assert "24" in str(span["output"]) or "twenty-four" in str(span["output"]).lower() +@pytest.mark.vcr +def test_openai_chat_streaming_sync_preserves_logprobs(memory_logger): + assert not memory_logger.pop() + + client = wrap_openai(openai.OpenAI()) + stream = client.chat.completions.create( + model=TEST_MODEL, + messages=[ + {"role": "system", "content": "Reply with exactly OK and nothing else."}, + {"role": "user", "content": "Reply with exactly: OK"}, + ], + stream=True, + temperature=0, + seed=1, + logprobs=True, + top_logprobs=2, + ) + + chunk_logprob_tokens = [] + content = "" + for chunk in stream: + if not chunk.choices: + continue + + choice = chunk.choices[0] + if choice.delta.content: + content += choice.delta.content + if choice.logprobs and choice.logprobs.content: + chunk_logprob_tokens.extend(entry.token for entry in choice.logprobs.content) + + assert "OK" in content + assert "OK" in "".join(chunk_logprob_tokens) + + spans = memory_logger.pop() + assert len(spans) == 1 + span = spans[0] + assert span["output"] + + output_choice = span["output"][0] + assert output_choice["finish_reason"] == "stop" + assert output_choice["logprobs"] is not None + assert output_choice["logprobs"]["content"] + assert "OK" in "".join(entry["token"] for entry in output_choice["logprobs"]["content"]) + + @pytest.mark.vcr def test_openai_chat_with_system_prompt(memory_logger): assert not memory_logger.pop() diff --git a/py/src/braintrust/integrations/openai/tracing.py b/py/src/braintrust/integrations/openai/tracing.py index ab10906f..3dde6d8a 100644 --- a/py/src/braintrust/integrations/openai/tracing.py +++ b/py/src/braintrust/integrations/openai/tracing.py @@ -464,6 +464,9 @@ def _postprocess_streaming_results(cls, all_results: list[dict[str, Any]]) -> di content = None tool_calls: list[Any] | None = None finish_reason = None + logprobs_content: list[Any] | None = None + logprobs_refusal: list[Any] | None = None + saw_logprobs = False metrics: dict[str, float] = {} for result in all_results: usage = result.get("usage") @@ -473,16 +476,35 @@ def _postprocess_streaming_results(cls, all_results: list[dict[str, Any]]) -> di choices = result["choices"] if not choices: continue - delta = choices[0]["delta"] + + choice = choices[0] + fr = choice.get("finish_reason") + if fr is not None: + finish_reason = fr + + choice_logprobs = choice.get("logprobs") + if choice_logprobs is not None: + saw_logprobs = True + + chunk_content_logprobs = choice_logprobs.get("content") + if chunk_content_logprobs is not None: + if logprobs_content is None: + logprobs_content = [] + logprobs_content.extend(chunk_content_logprobs) + + chunk_refusal_logprobs = choice_logprobs.get("refusal") + if chunk_refusal_logprobs is not None: + if logprobs_refusal is None: + logprobs_refusal = [] + logprobs_refusal.extend(chunk_refusal_logprobs) + + delta = choice.get("delta") if not delta: continue if role is None and delta.get("role") is not None: role = delta.get("role") - if delta.get("finish_reason") is not None: - finish_reason = delta.get("finish_reason") - if delta.get("content") is not None: content = (content or "") + delta.get("content") @@ -524,7 +546,14 @@ def _postprocess_streaming_results(cls, all_results: list[dict[str, Any]]) -> di "content": content, "tool_calls": tool_calls, }, - "logprobs": None, + "logprobs": ( + { + "content": logprobs_content, + "refusal": logprobs_refusal, + } + if saw_logprobs + else None + ), "finish_reason": finish_reason, } ],