Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 25 additions & 6 deletions app/features/agents/agents/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@
5. Formulate recommendation with clear metrics
6. If auto_deploy requested and model beats baselines, propose deployment

CONVERSATIONAL BEHAVIOR:
- If the user greets you or has not yet described a concrete forecasting
objective, reply conversationally in the `summary` field and ask what they
would like to experiment on. Do NOT call any tools until you have a specific
objective (a store and product plus a date range, or an explicit request).

{TOOL_USAGE_INSTRUCTIONS}

{SAFETY_INSTRUCTIONS}
Expand Down Expand Up @@ -212,20 +218,33 @@ async def tool_run_backtest(

@agent.tool_plain
def tool_compare_backtest_results(
result_a: dict[str, Any],
result_b: dict[str, Any],
result_a: dict[str, Any] | None = None,
result_b: dict[str, Any] | None = None,
) -> dict[str, Any]:
"""Compare two backtest results.

Use this to analyze which model performs better.
Use this to analyze which model performs better. Both arguments must be
full backtest-result dicts as returned by tool_run_backtest.

Args:
result_a: First backtest result.
result_b: Second backtest result.
result_a: First backtest result (from tool_run_backtest).
result_b: Second backtest result (from tool_run_backtest).

Returns:
Comparison with metric differences and recommendation.
Comparison with metric differences and recommendation, or an
informative error dict if either result is missing.
"""
# Tolerate missing/empty args: return a self-correcting hint instead of
# failing schema validation, which would burn the tool's retry budget
# and crash the whole run with UnexpectedModelBehavior.
if not result_a or not result_b:
return {
"error": "compare_backtest_results needs two backtest results.",
"hint": (
"Call tool_run_backtest twice first, then pass both result "
"dicts as result_a and result_b."
),
}
return compare_backtest_results(result_a, result_b)

@agent.tool
Expand Down
44 changes: 44 additions & 0 deletions app/features/agents/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import structlog
from pydantic_ai import Agent
from pydantic_ai.exceptions import UnexpectedModelBehavior
from pydantic_ai.messages import ModelMessage
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
Expand Down Expand Up @@ -261,6 +262,26 @@ async def chat(
raise TimeoutError(
f"Agent response timed out after {self.settings.agent_timeout_seconds} seconds"
) from e
except UnexpectedModelBehavior as e:
# The model misbehaved (e.g. a tool call exceeded its retry budget).
# This is recoverable from the user's perspective — surface a clean
# message instead of leaking the raw PydanticAI exception string.
logger.warning(
"agents.chat_model_misbehavior",
session_id=session_id,
error=str(e),
error_type=type(e).__name__,
)
session.last_activity = datetime.now(UTC)
await db.flush()
return ChatResponse(
session_id=session_id,
message=(
"I couldn't complete that request — the model produced an "
"invalid tool call. Please try rephrasing, or give me a "
"specific forecasting objective to work on."
),
)

# Extract tool calls from result
tool_calls: list[ToolCallResult] = []
Expand Down Expand Up @@ -561,6 +582,29 @@ async def stream_chat(
raise TimeoutError(
f"Agent response timed out after {self.settings.agent_timeout_seconds} seconds"
) from e
except UnexpectedModelBehavior as e:
# The model misbehaved (e.g. a tool call exceeded its retry budget).
# Emit a clean, recoverable `error` event rather than letting the raw
# PydanticAI exception bubble to the WebSocket handler.
logger.warning(
"agents.stream_chat_model_misbehavior",
session_id=session_id,
error=str(e),
error_type=type(e).__name__,
)
yield StreamEvent(
event_type="error",
data={
"error": (
"The assistant produced an invalid tool call and couldn't "
"complete the request. Please try rephrasing your message."
),
"error_type": "model_behavior_error",
"recoverable": True,
},
timestamp=datetime.now(UTC),
)
return

logger.info(
"agents.stream_chat_completed",
Expand Down
90 changes: 90 additions & 0 deletions app/features/agents/tests/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from pydantic_ai.exceptions import UnexpectedModelBehavior

from app.features.agents.deps import AgentDeps
from app.features.agents.models import AgentSession, AgentType, SessionStatus
Expand Down Expand Up @@ -287,6 +288,95 @@ async def test_chat_success(
assert response.tokens_used == 100
mock_agent.run.assert_called_once()

@pytest.mark.asyncio
async def test_chat_model_misbehavior_returns_friendly_message(
self,
sample_active_session: AgentSession,
) -> None:
"""A misbehaving model should yield a clean message, not crash.

Regression for issue #164: a tool call exceeding its retry budget
raised PydanticAI's `UnexpectedModelBehavior`, whose raw string
("Tool '...' exceeded max retries count of 1") leaked to the user.
"""
service = AgentService()
mock_db = AsyncMock()

mock_result = MagicMock()
mock_result.scalar_one_or_none.return_value = sample_active_session
mock_db.execute.return_value = mock_result

mock_agent = MagicMock()
mock_agent.run = AsyncMock(
side_effect=UnexpectedModelBehavior(
"Tool 'tool_compare_backtest_results' exceeded max retries count of 1"
)
)

with patch.object(service, "_get_agent", return_value=mock_agent):
response = await service.chat(
db=mock_db,
session_id=sample_active_session.session_id,
message="Hello",
)

assert response.session_id == sample_active_session.session_id
assert response.pending_approval is False
assert "invalid tool call" in response.message
assert "exceeded max retries" not in response.message


class TestAgentServiceStreamChat:
"""Tests for streaming chat functionality."""

@pytest.mark.asyncio
async def test_stream_chat_model_misbehavior_yields_error_event(
self,
sample_active_session: AgentSession,
) -> None:
"""A misbehaving model should yield a recoverable `error` event, not crash.

Regression for issue #164: `UnexpectedModelBehavior` raised inside
`agent.run_stream` bubbled to the WebSocket handler, which echoed the
raw exception string to the client.
"""
service = AgentService()
mock_db = AsyncMock()

mock_result = MagicMock()
mock_result.scalar_one_or_none.return_value = sample_active_session
mock_db.execute.return_value = mock_result

class _RaisingStream:
"""Async context manager that fails on entry like a misbehaving run."""

async def __aenter__(self) -> Any:
raise UnexpectedModelBehavior(
"Tool 'tool_compare_backtest_results' exceeded max retries count of 1"
)

async def __aexit__(self, *exc: object) -> bool:
return False

mock_agent = MagicMock()
mock_agent.run_stream = MagicMock(return_value=_RaisingStream())

with patch.object(service, "_get_agent", return_value=mock_agent):
events = [
event
async for event in service.stream_chat(
db=mock_db,
session_id=sample_active_session.session_id,
message="Hello",
)
]

assert len(events) == 1
assert events[0].event_type == "error"
assert events[0].data["recoverable"] is True
assert events[0].data["error_type"] == "model_behavior_error"
assert "exceeded max retries" not in events[0].data["error"]


class TestAgentServiceApproval:
"""Tests for approval workflow."""
Expand Down