diff --git a/app/features/agents/agents/experiment.py b/app/features/agents/agents/experiment.py index 02123ec4..e27945e2 100644 --- a/app/features/agents/agents/experiment.py +++ b/app/features/agents/agents/experiment.py @@ -60,6 +60,12 @@ 5. Formulate recommendation with clear metrics 6. If auto_deploy requested and model beats baselines, propose deployment +CONVERSATIONAL BEHAVIOR: +- If the user greets you or has not yet described a concrete forecasting + objective, reply conversationally in the `summary` field and ask what they + would like to experiment on. Do NOT call any tools until you have a specific + objective (a store and product plus a date range, or an explicit request). + {TOOL_USAGE_INSTRUCTIONS} {SAFETY_INSTRUCTIONS} @@ -212,20 +218,33 @@ async def tool_run_backtest( @agent.tool_plain def tool_compare_backtest_results( - result_a: dict[str, Any], - result_b: dict[str, Any], + result_a: dict[str, Any] | None = None, + result_b: dict[str, Any] | None = None, ) -> dict[str, Any]: """Compare two backtest results. - Use this to analyze which model performs better. + Use this to analyze which model performs better. Both arguments must be + full backtest-result dicts as returned by tool_run_backtest. Args: - result_a: First backtest result. - result_b: Second backtest result. + result_a: First backtest result (from tool_run_backtest). + result_b: Second backtest result (from tool_run_backtest). Returns: - Comparison with metric differences and recommendation. + Comparison with metric differences and recommendation, or an + informative error dict if either result is missing. """ + # Tolerate missing/empty args: return a self-correcting hint instead of + # failing schema validation, which would burn the tool's retry budget + # and crash the whole run with UnexpectedModelBehavior. + if not result_a or not result_b: + return { + "error": "compare_backtest_results needs two backtest results.", + "hint": ( + "Call tool_run_backtest twice first, then pass both result " + "dicts as result_a and result_b." + ), + } return compare_backtest_results(result_a, result_b) @agent.tool diff --git a/app/features/agents/service.py b/app/features/agents/service.py index c510befc..013ca4a9 100644 --- a/app/features/agents/service.py +++ b/app/features/agents/service.py @@ -20,6 +20,7 @@ import structlog from pydantic_ai import Agent +from pydantic_ai.exceptions import UnexpectedModelBehavior from pydantic_ai.messages import ModelMessage from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -261,6 +262,26 @@ async def chat( raise TimeoutError( f"Agent response timed out after {self.settings.agent_timeout_seconds} seconds" ) from e + except UnexpectedModelBehavior as e: + # The model misbehaved (e.g. a tool call exceeded its retry budget). + # This is recoverable from the user's perspective — surface a clean + # message instead of leaking the raw PydanticAI exception string. + logger.warning( + "agents.chat_model_misbehavior", + session_id=session_id, + error=str(e), + error_type=type(e).__name__, + ) + session.last_activity = datetime.now(UTC) + await db.flush() + return ChatResponse( + session_id=session_id, + message=( + "I couldn't complete that request — the model produced an " + "invalid tool call. Please try rephrasing, or give me a " + "specific forecasting objective to work on." + ), + ) # Extract tool calls from result tool_calls: list[ToolCallResult] = [] @@ -561,6 +582,29 @@ async def stream_chat( raise TimeoutError( f"Agent response timed out after {self.settings.agent_timeout_seconds} seconds" ) from e + except UnexpectedModelBehavior as e: + # The model misbehaved (e.g. a tool call exceeded its retry budget). + # Emit a clean, recoverable `error` event rather than letting the raw + # PydanticAI exception bubble to the WebSocket handler. + logger.warning( + "agents.stream_chat_model_misbehavior", + session_id=session_id, + error=str(e), + error_type=type(e).__name__, + ) + yield StreamEvent( + event_type="error", + data={ + "error": ( + "The assistant produced an invalid tool call and couldn't " + "complete the request. Please try rephrasing your message." + ), + "error_type": "model_behavior_error", + "recoverable": True, + }, + timestamp=datetime.now(UTC), + ) + return logger.info( "agents.stream_chat_completed", diff --git a/app/features/agents/tests/test_service.py b/app/features/agents/tests/test_service.py index 4b831c46..7cdf8435 100644 --- a/app/features/agents/tests/test_service.py +++ b/app/features/agents/tests/test_service.py @@ -5,6 +5,7 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest +from pydantic_ai.exceptions import UnexpectedModelBehavior from app.features.agents.deps import AgentDeps from app.features.agents.models import AgentSession, AgentType, SessionStatus @@ -287,6 +288,95 @@ async def test_chat_success( assert response.tokens_used == 100 mock_agent.run.assert_called_once() + @pytest.mark.asyncio + async def test_chat_model_misbehavior_returns_friendly_message( + self, + sample_active_session: AgentSession, + ) -> None: + """A misbehaving model should yield a clean message, not crash. + + Regression for issue #164: a tool call exceeding its retry budget + raised PydanticAI's `UnexpectedModelBehavior`, whose raw string + ("Tool '...' exceeded max retries count of 1") leaked to the user. + """ + service = AgentService() + mock_db = AsyncMock() + + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = sample_active_session + mock_db.execute.return_value = mock_result + + mock_agent = MagicMock() + mock_agent.run = AsyncMock( + side_effect=UnexpectedModelBehavior( + "Tool 'tool_compare_backtest_results' exceeded max retries count of 1" + ) + ) + + with patch.object(service, "_get_agent", return_value=mock_agent): + response = await service.chat( + db=mock_db, + session_id=sample_active_session.session_id, + message="Hello", + ) + + assert response.session_id == sample_active_session.session_id + assert response.pending_approval is False + assert "invalid tool call" in response.message + assert "exceeded max retries" not in response.message + + +class TestAgentServiceStreamChat: + """Tests for streaming chat functionality.""" + + @pytest.mark.asyncio + async def test_stream_chat_model_misbehavior_yields_error_event( + self, + sample_active_session: AgentSession, + ) -> None: + """A misbehaving model should yield a recoverable `error` event, not crash. + + Regression for issue #164: `UnexpectedModelBehavior` raised inside + `agent.run_stream` bubbled to the WebSocket handler, which echoed the + raw exception string to the client. + """ + service = AgentService() + mock_db = AsyncMock() + + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = sample_active_session + mock_db.execute.return_value = mock_result + + class _RaisingStream: + """Async context manager that fails on entry like a misbehaving run.""" + + async def __aenter__(self) -> Any: + raise UnexpectedModelBehavior( + "Tool 'tool_compare_backtest_results' exceeded max retries count of 1" + ) + + async def __aexit__(self, *exc: object) -> bool: + return False + + mock_agent = MagicMock() + mock_agent.run_stream = MagicMock(return_value=_RaisingStream()) + + with patch.object(service, "_get_agent", return_value=mock_agent): + events = [ + event + async for event in service.stream_chat( + db=mock_db, + session_id=sample_active_session.session_id, + message="Hello", + ) + ] + + assert len(events) == 1 + assert events[0].event_type == "error" + assert events[0].data["recoverable"] is True + assert events[0].data["error_type"] == "model_behavior_error" + assert "exceeded max retries" not in events[0].data["error"] + class TestAgentServiceApproval: """Tests for approval workflow."""