From baf324c3058f3b1e70ef3c5dba11184ca4bab82d Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 7 Apr 2026 14:24:26 +0000 Subject: [PATCH 1/3] test: Migrate pydantic ai golden tests --- .../test_agent_with_error_handling.yaml | 30 +++ .../test_agent_with_long_context.yaml | 31 +++ .../cassettes/test_agent_with_prefill.yaml | 30 +++ .../test_agent_with_short_max_tokens.yaml | 29 +++ .../test_agent_with_stop_sequences.yaml | 30 +++ .../test_pydantic_ai_integration.py | 191 +++++++++++++++++- 6 files changed, 338 insertions(+), 3 deletions(-) create mode 100644 py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_error_handling.yaml create mode 100644 py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_long_context.yaml create mode 100644 py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_prefill.yaml create mode 100644 py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_short_max_tokens.yaml create mode 100644 py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_stop_sequences.yaml diff --git a/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_error_handling.yaml b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_error_handling.yaml new file mode 100644 index 00000000..7ea7829a --- /dev/null +++ b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_error_handling.yaml @@ -0,0 +1,30 @@ +interactions: +- request: + body: '{"messages":[{"role":"user","content":[{"type":"image_url","image_url":{"url":"data:image/png;base64,SU5WQUxJRF9QTkdfREFUQV9OT1RfQV9SRUFMX0lNQUdF"}},{"type":"text","text":"What''s + in this image?"}]}],"model":"gpt-4o-mini","max_completion_tokens":100,"stream":false}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: '{"error":{"message":"Could not process image. The image data is not + valid.","type":"invalid_request_error","param":null,"code":"invalid_image"}}' + headers: + Content-Type: + - application/json + status: + code: 400 + message: Bad Request +version: 1 diff --git a/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_long_context.yaml b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_long_context.yaml new file mode 100644 index 00000000..509c33c5 --- /dev/null +++ b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_long_context.yaml @@ -0,0 +1,31 @@ +interactions: +- request: + body: '{"messages":[{"role":"user","content":"Here is a long text:\n\nThe quick + brown fox jumps over the lazy dog. 
<...repeated 20 times...>\n\nHow many times + does the word ''fox'' appear?"}],"model":"gpt-4o-mini","max_completion_tokens":100,"stream":false}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: '{"id":"chatcmpl-long001","object":"chat.completion","created":1735000000,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_test","choices":[{"index":0,"message":{"role":"assistant","content":"The + word ''fox'' appears 20 times in the text.","refusal":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":120,"completion_tokens":14,"total_tokens":134,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}}' + headers: + Content-Type: + - application/json + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_prefill.yaml b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_prefill.yaml new file mode 100644 index 00000000..96839e52 --- /dev/null +++ b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_prefill.yaml @@ -0,0 +1,30 @@ +interactions: +- request: + body: '{"messages":[{"role":"user","content":"Write a haiku about coding."},{"role":"assistant","content":"Here + is a haiku:"},{"role":"user","content":"Write a haiku about coding."}],"model":"gpt-4o-mini","max_completion_tokens":200,"stream":false}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: '{"id":"chatcmpl-pre001","object":"chat.completion","created":1735000000,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_test","choices":[{"index":0,"message":{"role":"assistant","content":"Whitespace + tells a tale\nFunctions dance in loops and turns\nClean code brings freedom","refusal":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":35,"completion_tokens":16,"total_tokens":51,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}}' + headers: + Content-Type: + - application/json + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_short_max_tokens.yaml b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_short_max_tokens.yaml new file mode 100644 index 00000000..924a1fc4 --- /dev/null +++ b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_short_max_tokens.yaml @@ -0,0 +1,29 @@ +interactions: +- request: + body: '{"messages":[{"role":"user","content":"What is AI?"}],"model":"gpt-4o-mini","max_completion_tokens":5,"stream":false}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - python-httpx/0.28.1 + method: 
POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: '{"id":"chatcmpl-short001","object":"chat.completion","created":1735000000,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_test","choices":[{"index":0,"message":{"role":"assistant","content":"AI + stands for Artificial","refusal":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"completion_tokens":5,"total_tokens":16,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}}' + headers: + Content-Type: + - application/json + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_stop_sequences.yaml b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_stop_sequences.yaml new file mode 100644 index 00000000..fb9386cd --- /dev/null +++ b/py/src/braintrust/integrations/pydantic_ai/cassettes/test_agent_with_stop_sequences.yaml @@ -0,0 +1,30 @@ +interactions: +- request: + body: '{"messages":[{"role":"user","content":"Write a short story about a robot."}],"model":"gpt-4o-mini","max_completion_tokens":500,"stop":["END","\n\n"],"stream":false}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - python-httpx/0.28.1 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: '{"id":"chatcmpl-stop001","object":"chat.completion","created":1735000000,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_test","choices":[{"index":0,"message":{"role":"assistant","content":"Once + upon a time, there was a small robot named Bolt who loved exploring the world. + Every day, Bolt would venture out to discover new things.","refusal":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":16,"completion_tokens":31,"total_tokens":47,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}}' + headers: + Content-Type: + - application/json + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py b/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py index c6142505..2d568d0c 100644 --- a/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py +++ b/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py @@ -1573,8 +1573,7 @@ def check_item(item): async def test_agent_with_document_input(memory_logger): """Test that agents with document input (PDF) properly serialize attachments. - This specifically tests the scenario from test_document_input in the golden tests, - verifying that both agent_run and chat spans convert BinaryContent to Braintrust + Verifies that both agent_run and chat spans convert BinaryContent to Braintrust attachments for document files like PDFs. 
""" from braintrust.logger import Attachment @@ -2777,4 +2776,190 @@ def wrapped(): wrapper(wrapped, instance, (), {}) assert instance.call_count == 1 - assert instance._async_producer == original_async_producer + + + +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_agent_with_stop_sequences(memory_logger): + """Test Agent respects stop_sequences in model settings.""" + assert not memory_logger.pop() + + agent = Agent( + MODEL, + model_settings=ModelSettings(max_tokens=500, stop_sequences=["END", "\n\n"]), + ) + + start = time.time() + result = await agent.run("Write a short story about a robot.") + end = time.time() + + assert result.output + + spans = memory_logger.pop() + assert len(spans) >= 2 + + agent_span = next( + ( + s + for s in spans + if "agent_run" in s["span_attributes"]["name"] and "chat" not in s["span_attributes"]["name"] + ), + None, + ) + assert agent_span is not None, "agent_run span not found" + assert agent_span["metadata"]["model"] == "gpt-4o-mini" + + # stop_sequences on the agent constructor → in metadata.model_settings + assert "model_settings" in agent_span["metadata"] + settings = agent_span["metadata"]["model_settings"] + assert settings.get("stop_sequences") == ["END", "\n\n"] + + _assert_metrics_are_valid(agent_span["metrics"], start, end) + + +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_agent_with_prefill(memory_logger): + """Test Agent with a partial assistant response in message_history. + + The 'prefill' pattern puts an incomplete assistant message at the end of the + history so the model continues from that point. + """ + from pydantic_ai.messages import ModelResponse, TextPart + + assert not memory_logger.pop() + + agent = Agent(MODEL, model_settings=ModelSettings(max_tokens=200)) + + prefill_history = [ + ModelRequest(parts=[UserPromptPart(content="Write a haiku about coding.")]), + ModelResponse(parts=[TextPart(content="Here is a haiku:")]), + ] + + start = time.time() + result = await agent.run("Write a haiku about coding.", message_history=prefill_history) + end = time.time() + + assert result.output + + spans = memory_logger.pop() + assert len(spans) >= 2 + + agent_span = next( + ( + s + for s in spans + if "agent_run" in s["span_attributes"]["name"] and "chat" not in s["span_attributes"]["name"] + ), + None, + ) + assert agent_span is not None, "agent_run span not found" + assert agent_span["metadata"]["model"] == "gpt-4o-mini" + + # The prefill history (including the partial assistant TextPart) must appear + # in the span input so that the trace is complete and auditable. + assert "message_history" in str(agent_span["input"]) + assert "Here is a haiku" in str(agent_span["input"]) + assert agent_span["output"] + + _assert_metrics_are_valid(agent_span["metrics"], start, end) + + +@pytest.mark.vcr +@pytest.mark.asyncio +async def test_agent_with_short_max_tokens(memory_logger): + """Test Agent with a very small max_tokens that truncates the response.""" + assert not memory_logger.pop() + + agent = Agent(MODEL) + + start = time.time() + result = await agent.run("What is AI?", model_settings=ModelSettings(max_tokens=5)) + end = time.time() + + # Truncated responses are still valid output; no exception should be raised. 
+    assert result.output
+
+    spans = memory_logger.pop()
+    assert len(spans) >= 2
+
+    agent_span = next(
+        (
+            s
+            for s in spans
+            if "agent_run" in s["span_attributes"]["name"] and "chat" not in s["span_attributes"]["name"]
+        ),
+        None,
+    )
+    assert agent_span is not None, "agent_run span not found"
+    assert agent_span["metadata"]["model"] == "gpt-4o-mini"
+
+    # max_tokens passed to run() → in input.model_settings
+    assert "model_settings" in agent_span["input"]
+    assert agent_span["input"]["model_settings"].get("max_tokens") == 5
+
+    assert agent_span["output"]
+    _assert_metrics_are_valid(agent_span["metrics"], start, end)
+
+
+@pytest.mark.vcr
+@pytest.mark.asyncio
+async def test_agent_with_long_context(memory_logger):
+    """Test Agent handles large input context without errors."""
+    assert not memory_logger.pop()
+
+    agent = Agent(MODEL, model_settings=ModelSettings(max_tokens=100))
+
+    long_text = "The quick brown fox jumps over the lazy dog. " * 20
+    prompt = f"Here is a long text:\n\n{long_text}\n\nHow many times does the word 'fox' appear?"
+
+    start = time.time()
+    result = await agent.run(prompt)
+    end = time.time()
+
+    assert result.output
+
+    spans = memory_logger.pop()
+    assert len(spans) >= 2
+
+    agent_span = next(
+        (
+            s
+            for s in spans
+            if "agent_run" in s["span_attributes"]["name"] and "chat" not in s["span_attributes"]["name"]
+        ),
+        None,
+    )
+    assert agent_span is not None, "agent_run span not found"
+    assert agent_span["metadata"]["model"] == "gpt-4o-mini"
+    # The long prompt should be captured in the span input
+    assert "fox" in str(agent_span["input"]).lower()
+    assert agent_span["output"]
+
+    _assert_metrics_are_valid(agent_span["metrics"], start, end)
+
+
+@pytest.mark.vcr
+@pytest.mark.asyncio
+async def test_agent_with_error_handling(memory_logger):
+    """Test Agent propagates API errors for invalid binary content.
+
+    Sends corrupted image bytes; the API returns a 400 error. The exception must
+    propagate to the caller rather than being silently swallowed.
+    """
+    from pydantic_ai import BinaryContent
+
+    assert not memory_logger.pop()
+
+    agent = Agent(MODEL, model_settings=ModelSettings(max_tokens=100))
+
+    corrupted_data = b"INVALID_PNG_DATA_NOT_A_REAL_IMAGE"
+
+    with pytest.raises(Exception):
+        await agent.run(
+            [
+                BinaryContent(data=corrupted_data, media_type="image/png"),
+                "What's in this image?",
+            ]
+        )

From e424be01f52b9b220b38a8a287f5aec7513abe4d Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad
Date: Tue, 7 Apr 2026 14:27:04 +0000
Subject: [PATCH 2/3] remove old pydantic-ai golden tests

---
 internal/golden/README.md                     |   6 -
 .../golden/pydantic-ai-v1/pydantic_ai_test.py | 913 ------------------
 internal/golden/pydantic-ai-v1/pyproject.toml |  13 -
 3 files changed, 932 deletions(-)
 delete mode 100644 internal/golden/pydantic-ai-v1/pydantic_ai_test.py
 delete mode 100644 internal/golden/pydantic-ai-v1/pyproject.toml

diff --git a/internal/golden/README.md b/internal/golden/README.md
index 9c543b47..b3ef3e9d 100644
--- a/internal/golden/README.md
+++ b/internal/golden/README.md
@@ -24,12 +24,6 @@ cd langchain-py-v1
 python langchain.py
 ```
 
-```bash
-cd pydantic-ai-v1
-python pydantic_ai_test.py
-```
-
-
 ## Requirements
 
 Before running a suite, ensure you have the appropriate API keys set as environment variables for that provider, along with `BRAINTRUST_API_KEY` if you want to log traces to Braintrust.
diff --git a/internal/golden/pydantic-ai-v1/pydantic_ai_test.py b/internal/golden/pydantic-ai-v1/pydantic_ai_test.py deleted file mode 100644 index 8a6fde51..00000000 --- a/internal/golden/pydantic-ai-v1/pydantic_ai_test.py +++ /dev/null @@ -1,913 +0,0 @@ -# pyright: reportUnknownMemberType=none -# pyright: reportUnknownVariableType=none -# pyright: reportUnknownParameterType=none -# pyright: reportUnknownArgumentType=none -import asyncio -from collections.abc import AsyncIterator -from pathlib import Path - -import braintrust -from braintrust import traced -from braintrust.wrappers.pydantic_ai import setup_pydantic_ai -from pydantic import BaseModel -from pydantic_ai import Agent, BinaryContent, ModelSettings -from pydantic_ai.direct import model_request, model_request_stream -from pydantic_ai.messages import ( - ModelMessage, - ModelRequest, - ModelResponse, - TextPart, - UserPromptPart, -) -from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel, OpenAIResponsesModelSettings - - -setup_pydantic_ai(project_name="golden-py-pydantic_ai") - -FIXTURES_DIR = Path(__file__).parent.parent / "fixtures" - - -# Test 1: Basic completion -@traced -async def test_basic_completion(): - print("\n=== Test 1: Basic Completion ===") - - # High-level Agent API - print("\n--- Agent completion ---") - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=100), - ) - result = await agent.run("What is the capital of France?") - print(result.output) - - # Another agent with different settings - print("\n--- Agent completion with different settings ---") - agent2 = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=100, temperature=0.7), - ) - result2 = await agent2.run("What is the capital of Spain?") - print(result2.output) - - # Low-level Direct API - print("\n--- Direct API completion ---") - model = OpenAIChatModel("gpt-4o") - messages: list[ModelMessage] = [ModelRequest(parts=[UserPromptPart(content="What is the capital of Italy?")])] - direct_result = await model_request(model=model, messages=messages) - print(direct_result.parts[0].content) - - # Low-level Direct API with model_settings - print("\n--- Direct API with model_settings ---") - settings = ModelSettings(max_tokens=50, temperature=0.8) - messages_with_settings: list[ModelMessage] = [ModelRequest(parts=[UserPromptPart(content="Say hello in 5 words")])] - direct_result_settings = await model_request(model=model, messages=messages_with_settings, model_settings=settings) - print(f"Result: {direct_result_settings.parts[0].content}") - print( - f"Usage: input={direct_result_settings.usage.input_tokens}, output={direct_result_settings.usage.output_tokens}" - ) - - -# Test 2: Multi-turn conversation -@traced -async def test_multi_turn(): - print("\n=== Test 2: Multi-turn Conversation ===") - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=200), - ) - - # Simulate a multi-turn conversation by passing message history - message_history = [ - ModelRequest(parts=[UserPromptPart(content="Hi, my name is Alice.")]), - ModelResponse(parts=[TextPart(content="Hello Alice! Nice to meet you.")]), - ] - result = await agent.run( - "What did I just tell you my name was?", - message_history=message_history, - ) - print(result.output) - - -# Test 3: System prompt -@traced -async def test_system_prompt(): - print("\n=== Test 3: System Prompt ===") - - print("\n--- System prompt (pirate) ---") - agent = Agent( - "openai:gpt-4o", - system_prompt="You are a pirate. 
Always respond in pirate speak.", - model_settings=ModelSettings(max_tokens=150), - ) - result = await agent.run("Tell me about the weather.") - print(result.output) - - -# Test 4: Streaming response -@traced -async def test_streaming(): - print("\n=== Test 4: Streaming ===") - - # Use identical settings and prompt for all calls to verify offset consistency - IDENTICAL_PROMPT = "Count from 1 to 5." - IDENTICAL_SETTINGS = ModelSettings(max_tokens=100) - - # Group 1: Complete streaming (no early breaks) - with braintrust.start_span(name="Complete streaming (calls 1-4)") as complete_span: - # High-level Agent API - Call 1 - print("\n--- Agent streaming (call 1) ---") - agent1 = Agent( - "openai:gpt-4o", - model_settings=IDENTICAL_SETTINGS, - ) - full_text1 = "" - async with agent1.run_stream(IDENTICAL_PROMPT) as result1: - async for text in result1.stream_text(delta=True): - print(text, end="", flush=True) - full_text1 += text - print("\n") - - # High-level Agent API - Call 2 (identical to call 1) - print("\n--- Agent streaming (call 2 - identical) ---") - agent2 = Agent( - "openai:gpt-4o", - model_settings=IDENTICAL_SETTINGS, - ) - full_text2 = "" - async with agent2.run_stream(IDENTICAL_PROMPT) as result2: - async for text in result2.stream_text(delta=True): - print(text, end="", flush=True) - full_text2 += text - print("\n") - - print("\n--- Direct API streaming (call 3 - identical) ---") - model = OpenAIChatModel("gpt-4o") - messages: list[ModelMessage] = [ModelRequest(parts=[UserPromptPart(content=IDENTICAL_PROMPT)])] - - direct_text = "" - seen_delta = False - async with model_request_stream(model=model, messages=messages, model_settings=IDENTICAL_SETTINGS) as stream: - async for chunk in stream: - # Handle PartStartEvent which contains initial text (only if we haven't seen deltas yet) - if hasattr(chunk, "part") and hasattr(chunk.part, "content") and not seen_delta: - text = str(chunk.part.content) - print(text, end="", flush=True) - direct_text += text - # Handle PartDeltaEvent with delta content - elif hasattr(chunk, "delta") and chunk.delta: - seen_delta = True - # Extract content_delta from TextPartDelta - if hasattr(chunk.delta, "content_delta") and chunk.delta.content_delta: - text = chunk.delta.content_delta - print(text, end="", flush=True) - direct_text += text - elif isinstance(chunk.delta, str): - # Handle case where delta is already a string - print(chunk.delta, end="", flush=True) - direct_text += chunk.delta - - print("\n") - - print("\n--- Direct API streaming (call 4 - identical) ---") - messages_4: list[ModelMessage] = [ModelRequest(parts=[UserPromptPart(content=IDENTICAL_PROMPT)])] - - direct_text_4 = "" - seen_delta_4 = False - async with model_request_stream( - model=model, messages=messages_4, model_settings=IDENTICAL_SETTINGS - ) as stream_4: - async for chunk in stream_4: - # Handle PartStartEvent which contains initial text (only if we haven't seen deltas yet) - if hasattr(chunk, "part") and hasattr(chunk.part, "content") and not seen_delta_4: - text = str(chunk.part.content) - print(text, end="", flush=True) - direct_text_4 += text - # Handle PartDeltaEvent with delta content - elif hasattr(chunk, "delta") and chunk.delta: - seen_delta_4 = True - # Extract content_delta from TextPartDelta - if hasattr(chunk.delta, "content_delta") and chunk.delta.content_delta: - text = chunk.delta.content_delta - print(text, end="", flush=True) - direct_text_4 += text - elif isinstance(chunk.delta, str): - # Handle case where delta is already a string - print(chunk.delta, 
end="", flush=True) - direct_text_4 += chunk.delta - - print("\n") - - # Group 2: Streaming with early break (calls 5-6) - with braintrust.start_span(name="Streaming with early break (calls 5-6)") as break_span: - # Low-level Direct API with early break (same context - usually works) - print("\n--- Direct API streaming with early break (call 5 - identical) ---") - early_break_model = OpenAIChatModel("gpt-4o") - early_break_messages: list[ModelMessage] = [ModelRequest(parts=[UserPromptPart(content=IDENTICAL_PROMPT)])] - - early_break_status = "unknown" - early_break_text = "" - try: - async with model_request_stream( - model=early_break_model, messages=early_break_messages, model_settings=IDENTICAL_SETTINGS - ) as stream: - i = 0 - seen_delta_5 = False - async for chunk in stream: - # Handle PartStartEvent which contains initial text (only if we haven't seen deltas yet) - if hasattr(chunk, "part") and hasattr(chunk.part, "content") and not seen_delta_5: - text = str(chunk.part.content) - print(text, end="", flush=True) - early_break_text += text - # Handle PartDeltaEvent with delta content - elif hasattr(chunk, "delta") and chunk.delta: - seen_delta_5 = True - if hasattr(chunk.delta, "content_delta") and chunk.delta.content_delta: - text = chunk.delta.content_delta - print(text, end="", flush=True) - early_break_text += text - elif isinstance(chunk.delta, str): - print(chunk.delta, end="", flush=True) - early_break_text += chunk.delta - - i += 1 - - # Early break - within same context, usually OK - if i >= 3: - print("\n⚠️ Breaking early from stream...") - break - - print("✓ Completed without error") - early_break_status = "success" - except Exception as e: - print(f"✗ Error occurred: {type(e).__name__}: {e}") - early_break_status = f"error: {type(e).__name__}" - - # Customer's pattern: Async generator with early break (triggers context error!) - print("\n--- CUSTOMER PATTERN: Async generator with early break (call 6 - identical) ---") - print("(This reproduces: 'Token was created in a different Context' error)") - generator_status = "unknown" - generator_text = "" - try: - i = 0 - - # Inline the async generator pattern - model_gen = OpenAIChatModel("gpt-4o-mini") - messages_gen: list[ModelMessage] = [ModelRequest(parts=[UserPromptPart(content=IDENTICAL_PROMPT)])] - - seen_delta_6 = False - async with model_request_stream(model=model_gen, messages=messages_gen) as stream_gen: - # Yield streaming chunks - async for event in stream_gen: - # Handle PartStartEvent which contains initial text (only if we haven't seen deltas yet) - if hasattr(event, "part") and hasattr(event.part, "content") and not seen_delta_6: - text = str(event.part.content) - print(text, end="", flush=True) - generator_text += text - # Handle PartDeltaEvent with delta content - elif hasattr(event, "delta") and event.delta: - seen_delta_6 = True - if hasattr(event.delta, "content_delta") and event.delta.content_delta: - text = event.delta.content_delta - print(text, end="", flush=True) - generator_text += text - elif isinstance(event.delta, str): - print(event.delta, end="", flush=True) - generator_text += event.delta - - i += 1 - - # Early break - generator closed in different context → ERROR! 
- if i >= 3: - print("\n⚠️ Breaking early from async generator...") - break - - print("✓ Completed without error") - generator_status = "success" - except Exception as e: - print(f"✗ Error occurred: {type(e).__name__}: {e}") - generator_status = f"error: {type(e).__name__}" - - # Group 3: _stream_single/_buffer_stream pattern (call 7) - with braintrust.start_span(name="_stream_single/_buffer_stream pattern (call 7)"): - # Customer pattern 2: _stream_single/_buffer_stream pattern - # This pattern uses an async generator that yields chunks AND a final response, - # with a consumer that returns early when it sees the final ModelResponse - print("\n--- CUSTOMER PATTERN 2: _stream_single/_buffer_stream (call 7) ---") - print("(Generator yields chunks + final response, consumer returns on ModelResponse)") - - class LLMStreamResponse: - """Simple wrapper for streaming responses.""" - - def __init__(self, llm_response: object, is_final: bool = False): - self.llm_response = llm_response - self.is_final = is_final - - # @traced - async def _stream_single() -> AsyncIterator[LLMStreamResponse]: - """Async generator that yields streaming chunks and final response.""" - model_stream = OpenAIChatModel("gpt-4o-mini") - messages_stream: list[ModelMessage] = [ModelRequest(parts=[UserPromptPart(content=IDENTICAL_PROMPT)])] - - async with model_request_stream( - model=model_stream, messages=messages_stream, model_settings=IDENTICAL_SETTINGS - ) as stream: - async for chunk in stream: - yield LLMStreamResponse(llm_response=chunk, is_final=False) - - response = stream.get() - yield LLMStreamResponse(llm_response=response, is_final=True) - - async def _buffer_stream() -> LLMStreamResponse: - """Consumer that returns early when it gets a ModelResponse.""" - async for event in _stream_single(): - if isinstance(event.llm_response, ModelResponse): - return event - raise RuntimeError("No ModelResponse received") - - try: - result = await _buffer_stream() - print(f"✓ Received final response: {type(result.llm_response).__name__}") - except Exception as e: - print(f"✗ Error occurred: {type(e).__name__}: {e}") - - -# Test 5: Image input -@traced -async def test_image_input(): - print("\n=== Test 5: Image Input ===") - image_path = FIXTURES_DIR / "test-image.png" - - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=150), - ) - - with open(image_path, "rb") as f: - image_data = f.read() - - result = await agent.run( - [ - BinaryContent(data=image_data, media_type="image/png"), - "What color is this image?", - ] - ) - print(result.output) - - -# Test 6: Document input -@traced -async def test_document_input(): - print("\n=== Test 6: Document Input ===") - pdf_path = FIXTURES_DIR / "test-document.pdf" - - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=150), - ) - - with open(pdf_path, "rb") as f: - pdf_data = f.read() - - result = await agent.run( - [ - BinaryContent(data=pdf_data, media_type="application/pdf"), - "What is in this document?", - ] - ) - print(result.output) - - -# Test 7: Temperature variations -@traced -async def test_temperature_variations(): - print("\n=== Test 7: Temperature Variations ===") - - configs = [ - {"temperature": 0.0, "top_p": 1.0}, - {"temperature": 1.0, "top_p": 0.9}, - {"temperature": 0.7, "top_p": 0.95}, - ] - - results = [] - for config in configs: - print(f"\nConfig: temp={config['temperature']}, top_p={config['top_p']}") - - agent = Agent( - "openai:gpt-4o", - ) - - result = await agent.run( - "Say something creative.", - 
model_settings=ModelSettings( - max_tokens=50, - temperature=config["temperature"], - top_p=config["top_p"], - ), - ) - print(result.output) - results.append(result) - - -# Test 8: Stop sequences -@traced -async def test_stop_sequences(): - print("\n=== Test 8: Stop Sequences ===") - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings( - max_tokens=500, - stop_sequences=["END", "\n\n"], - ), - ) - - result = await agent.run("Write a short story about a robot.") - print(result.output) - - -# Test 9: Metadata -@traced -async def test_metadata(): - print("\n=== Test 9: Metadata ===") - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=100), - ) - - result = await agent.run("Hello!", deps="test_user_123") - print(result.output) - - -# Test 10: Long context -@traced -async def test_long_context(): - print("\n=== Test 10: Long Context ===") - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=100), - ) - - long_text = "The quick brown fox jumps over the lazy dog. " * 100 - result = await agent.run(f"Here is a long text:\n\n{long_text}\n\nHow many times does the word 'fox' appear?") - print(result.output) - - -# Test 11: Mixed content types -@traced -async def test_mixed_content(): - print("\n=== Test 11: Mixed Content Types ===") - image_path = FIXTURES_DIR / "test-image.png" - - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=200), - ) - - with open(image_path, "rb") as f: - image_data = f.read() - - result = await agent.run( - [ - "First, look at this image:", - BinaryContent(data=image_data, media_type="image/png"), - "Now describe what you see and explain why it matters.", - ] - ) - print(result.output) - - -# Test 12: Prefill -@traced -async def test_prefill(): - print("\n=== Test 12: Prefill ===") - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=200), - ) - - # Simulate prefill by providing partial assistant response in message history - prefill_history = [ - ModelRequest(parts=[UserPromptPart(content="Write a haiku about coding.")]), - ModelResponse(parts=[TextPart(content="Here is a haiku:")]), - ] - - result = await agent.run( - "Write a haiku about coding.", - message_history=prefill_history, - ) - print(f"Response: {result.output}") - - -# Test 13: Very short max_tokens -@traced -async def test_short_max_tokens(): - print("\n=== Test 13: Very Short Max Tokens ===") - agent = Agent( - "openai:gpt-4o", - ) - - result = await agent.run( - "What is AI?", - model_settings=ModelSettings(max_tokens=5), - ) - print(result.output) - - -# Test 14: Tool use -@traced -async def test_tool_use(): - print("\n=== Test 14: Tool Use ===") - - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=500), - ) - - @agent.tool_plain - def get_weather(city_and_state: str, unit: str = "celsius") -> str: - """Get the current weather for a location. - - Args: - city_and_state: The city and state, e.g. San Francisco, CA - unit: The unit of temperature (celsius or fahrenheit). Default to celsius. 
- """ - return f"22 degrees {unit} and sunny in {city_and_state}" - - result = await agent.run("What is the weather like in Paris, France?") - print("Response content:") - print(result.output) - - -# Test 15: Tool use with result (multi-turn) -@traced -async def test_tool_use_with_result(): - print("\n=== Test 15: Tool Use With Result ===") - - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=500), - ) - - @agent.tool_plain - def calculate(operation: str, a: float, b: float) -> str: - """Perform a mathematical calculation. - - Args: - operation: The mathematical operation (add, subtract, multiply, divide) - a: First number - b: Second number - """ - ops = { - "add": a + b, - "subtract": a - b, - "multiply": a * b, - "divide": a / b if b != 0 else "Error: Division by zero", - } - return str(ops.get(operation, "Invalid operation")) - - # First request - agent will use the tool - print("First request:") - first_result = await agent.run("What is 127 multiplied by 49?", message_history=[]) - - # Show the message history structure - messages = first_result.all_messages() - print(f"\nMessage history after first request contains {len(messages)} messages:") - for i, msg in enumerate(messages): - msg_type = type(msg).__name__ - if hasattr(msg, "parts") and len(msg.parts) > 0: - part = msg.parts[0] - if hasattr(part, "tool_name"): - print(f" {i}: {msg_type} - Tool call: {part.tool_name}") - elif hasattr(part, "content"): - content_preview = str(part.content)[:50] - print(f" {i}: {msg_type} - Content: {content_preview}") - else: - print(f" {i}: {msg_type}") - - # Second request - provide the message history so agent sees the tool result - print("\nSecond request (with tool result in history):") - second_result = await agent.run("Thanks! Can you also tell me what 127 plus 49 is?", message_history=messages) - print("Response (with previous tool result in context):") - print(second_result.output) - - -# Test 16: Reasoning tokens generation and follow-up -@traced -async def test_reasoning(): - print("\n=== Test 16: Reasoning Tokens & Follow-up ===") - - # First request: Analyze pattern and derive formula - print("\n--- First request (generate reasoning) ---") - model = OpenAIResponsesModel("gpt-5-codex") - agent = Agent( - model, - model_settings=OpenAIResponsesModelSettings( - openai_reasoning_effort="high", - openai_reasoning_summary="detailed", - ), - ) - - first_result = await agent.run( - "Look at this sequence: 2, 6, 12, 20, 30. What is the pattern and what would be the formula for the nth term?" - ) - print("First response:") - print(first_result.output) - - # Second request: Apply the discovered pattern to solve a new problem - # Get all_messages() which includes the user prompt, reasoning, and response - print("\n--- Follow-up request (using reasoning context) ---") - message_history = first_result.all_messages() - print(f"Message history contains {len(message_history)} messages") - - follow_up_result = await agent.run( - "Using the pattern you discovered, what would be the 10th term? And can you find the sum of the first 10 terms?", - message_history=message_history, - ) - print("Follow-up response:") - print(follow_up_result.output) - - -# Test 18: Embeddings -# Skipped - Pydantic AI focuses on agent/chat interactions and doesn't wrap the embeddings API. -# The OpenAI test includes embeddings because it tests the full OpenAI client wrapper. 
- - -# Test 19: Response format (JSON schema) -# Skipped - Pydantic AI handles structured output through result_type with Pydantic models, -# which is more type-safe than the OpenAI response_format parameter. We test this approach -# in Tests 21-23 (structured output tests). - - -# Test 20: Multiple completions (n > 1) -# Skipped - Pydantic AI is designed for agent-based workflows and doesn't support the OpenAI -# 'n' parameter for generating multiple completions in a single request. - - -# Test 21: Structured output -@traced -async def test_structured_output(): - print("\n=== Test 21: Structured Output ===") - - class Ingredient(BaseModel): - name: str - amount: str - - class Recipe(BaseModel): - name: str - ingredients: list[Ingredient] - steps: list[str] - - agent = Agent( - "openai:gpt-4o", - system_prompt="You extract structured information from user queries.", - output_type=Recipe, - model_settings=ModelSettings(max_tokens=500), - retries=3, - ) - - result = await agent.run("Generate a simple recipe for chocolate chip cookies.") - recipe = result.output - print("Parsed recipe:") - print(f"Name: {recipe.name}") - print(f"Ingredients: {len(recipe.ingredients)}") - print(f"Steps: {len(recipe.steps)}") - - -# Test 22: Streaming structured output -@traced -async def test_streaming_structured_output(): - print("\n=== Test 22: Streaming Structured Output ===") - - class Product(BaseModel): - name: str - description: str - price: float - features: list[str] - - agent = Agent( - "openai:gpt-4o", - output_type=Product, - model_settings=ModelSettings(max_tokens=500), - retries=3, - ) - - # With structured output, we can't stream text - we stream the structure - # The stream completes when the full structured output is validated - async with agent.run_stream("Generate a product description for a wireless bluetooth headphone.") as result: - # Wait for the stream to complete and get the structured result - product = await result.get_output() - - print("Streaming completed") - print(f"Product: {product.name}") - print(f"Price: ${product.price}") - print(f"Features: {len(product.features)}") - - -# Test 23: Structured output with context -@traced -async def test_structured_output_with_context(): - print("\n=== Test 23: Structured Output with Context ===") - - class PriceComparison(BaseModel): - cheaper: str - price_difference: float - - class Comparison(BaseModel): - recommendation: str - reasoning: str - price_comparison: PriceComparison - phone_rating: float - laptop_rating: float - - agent = Agent( - "openai:gpt-4o", - system_prompt="You are a helpful shopping assistant. Use the provided product information to make recommendations.", - output_type=Comparison, - model_settings=ModelSettings(max_tokens=500), - retries=3, - ) - - product_info = { - "phone-123": { - "name": "SuperPhone X", - "price": 999, - "specs": "6.5 inch display, 128GB storage, 12MP camera", - }, - "laptop-456": { - "name": "ProBook Ultra", - "price": 1499, - "specs": "15 inch display, 512GB SSD, 16GB RAM", - }, - } - - reviews = { - "phone-123": { - "rating": 4.5, - "comments": ["Great camera!", "Battery lasts all day", "A bit pricey"], - }, - "laptop-456": { - "rating": 4.2, - "comments": ["Fast performance", "Good display", "Heavy to carry"], - }, - } - - result = await agent.run( - f"""Compare phone-123 and laptop-456. 
Here is the product info and reviews: - -Product Info: -- phone-123: {product_info["phone-123"]} -- laptop-456: {product_info["laptop-456"]} - -Reviews: -- phone-123: {reviews["phone-123"]} -- laptop-456: {reviews["laptop-456"]} - -Give me a structured comparison with your recommendation.""" - ) - - comparison = result.output - print("Product comparison:") - print(f"Recommendation: {comparison.recommendation}") - print(f"Reasoning: {comparison.reasoning}") - print(f"Cheaper: {comparison.price_comparison.cheaper}") - print(f"Price difference: ${comparison.price_comparison.price_difference}") - print(f"Phone rating: {comparison.phone_rating}") - print(f"Laptop rating: {comparison.laptop_rating}") - - -# Test 24: Error handling -@traced -async def test_error_handling(): - print("\n=== Test 24: Error Handling ===") - - # Test 1: Invalid image URL (404) - # Note: Pydantic AI's BinaryContent doesn't have from_url, so we test with a simulated fetch - @traced(name="test_error_invalid_image_url") - async def test_invalid_image_url(): - print("\n--- Test 1: Invalid Image URL ---") - try: - import httpx - - # Attempt to fetch invalid image - will fail with 404 - async with httpx.AsyncClient() as client: - response = await client.get("https://example.com/nonexistent-image-404.jpg") - image_data = response.content - - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=100), - ) - await agent.run( - [ - BinaryContent(data=image_data, media_type="image/jpeg"), - "What's in this image?", - ], - ) - raise Exception("Should have thrown an error") - except httpx.HTTPStatusError as e: - print(f"Caught HTTP error (expected):") - print(f" Type: {type(e).__name__}") - print(f" Status: {e.response.status_code}") - except Exception as e: - print(f"Caught error:") - print(f" Type: {type(e).__name__}") - print(f" Message: {e}") - - await test_invalid_image_url() - - # Test 2: Tool choice for non-existent function - # Skipped - Pydantic AI doesn't expose low-level tool_choice parameter like OpenAI. - # Tool selection is handled automatically by the agent. - - # Test 3: Tool call ID mismatch - # Skipped - Pydantic AI abstracts away tool call IDs. This low-level OpenAI API - # detail is not exposed in Pydantic AI's agent interface. - - # Test 4: Corrupted base64 image data - @traced(name="test_error_corrupted_base64_image") - async def test_corrupted_image(): - print("\n--- Test 4: Corrupted Base64 Image ---") - try: - agent = Agent( - "openai:gpt-4o", - model_settings=ModelSettings(max_tokens=100), - ) - await agent.run( - [ - BinaryContent( - data=b"INVALID_BASE64_DATA!!!", - media_type="image/png", - ), - "What's in this image?", - ], - ) - raise Exception("Should have thrown an error") - except Exception as e: - print(f"Caught corrupted image error:") - print(f" Type: {type(e).__name__}") - print(f" Message: {e}") - - await test_corrupted_image() - - # Test 5: Invalid JSON schema in response_format - # Skipped - Pydantic AI uses Pydantic models for structured output, not JSON schemas. - # Schema validation errors would occur at the Pydantic model level, which is tested - # in the structured output tests (21-23). 
- - print("\nError handling tests completed") - - -async def run_async_tests(): - """Run all asynchronous tests.""" - tests = [ - test_basic_completion, - test_multi_turn, - test_system_prompt, - test_streaming, - test_image_input, - test_document_input, - test_temperature_variations, - test_stop_sequences, - test_metadata, - test_long_context, - test_mixed_content, - test_prefill, - test_short_max_tokens, - test_tool_use, - test_tool_use_with_result, - test_reasoning, - test_structured_output, - test_streaming_structured_output, - test_structured_output_with_context, - test_error_handling, - ] - - for test in tests: - try: - await test() - # Rate limiting - await asyncio.sleep(1) - except Exception as e: - print(f"Test {test.__name__} failed: {e}") - import traceback - - traceback.print_exc() - - -async def main(): - """Run all tests.""" - print("=" * 60) - print("Pydantic AI Golden Tests with Braintrust") - print("=" * 60) - - # Run all async tests - print("\n### Running Pydantic AI Agent Tests ###") - await run_async_tests() - - print("\n" + "=" * 60) - print("All tests completed!") - print("=" * 60) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/internal/golden/pydantic-ai-v1/pyproject.toml b/internal/golden/pydantic-ai-v1/pyproject.toml deleted file mode 100644 index b10d3b49..00000000 --- a/internal/golden/pydantic-ai-v1/pyproject.toml +++ /dev/null @@ -1,13 +0,0 @@ -[project] -name = "golden" -version = "0.1.0" -description = "Add your description here" -readme = "README.md" -requires-python = ">=3.11" -dependencies = [ - "braintrust", - "pydantic-ai", -] - -[tool.uv.sources] -braintrust = { path = "../../py", editable = true } From 3cc1b99499b58cd308e64538258af01ae3b8195b Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 7 Apr 2026 14:32:49 +0000 Subject: [PATCH 3/3] format --- .../integrations/pydantic_ai/test_pydantic_ai_integration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py b/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py index 2d568d0c..8c29f8d9 100644 --- a/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py +++ b/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py @@ -2778,7 +2778,6 @@ def wrapped(): assert instance.call_count == 1 - @pytest.mark.vcr @pytest.mark.asyncio async def test_agent_with_stop_sequences(memory_logger):