w7-mgfcode · w7-mgfcode · May 18, 2026 · May 18, 2026
@@ -60,6 +60,12 @@
 5. Formulate recommendation with clear metrics
 6. If auto_deploy requested and model beats baselines, propose deployment
 
+CONVERSATIONAL BEHAVIOR:
+- If the user greets you or has not yet described a concrete forecasting
+  objective, reply conversationally in the `summary` field and ask what they
+  would like to experiment on. Do NOT call any tools until you have a specific
+  objective (a store and product plus a date range, or an explicit request).
+
 {TOOL_USAGE_INSTRUCTIONS}
 
 {SAFETY_INSTRUCTIONS}
@@ -212,20 +218,33 @@ async def tool_run_backtest(
 
     @agent.tool_plain
     def tool_compare_backtest_results(
-        result_a: dict[str, Any],
-        result_b: dict[str, Any],
+        result_a: dict[str, Any] | None = None,
+        result_b: dict[str, Any] | None = None,
     ) -> dict[str, Any]:
         """Compare two backtest results.
 
-        Use this to analyze which model performs better.
+        Use this to analyze which model performs better. Both arguments must be
+        full backtest-result dicts as returned by tool_run_backtest.
 
         Args:
-            result_a: First backtest result.
-            result_b: Second backtest result.
+            result_a: First backtest result (from tool_run_backtest).
+            result_b: Second backtest result (from tool_run_backtest).
 
         Returns:
-            Comparison with metric differences and recommendation.
+            Comparison with metric differences and recommendation, or an
+            informative error dict if either result is missing.
         """
+        # Tolerate missing/empty args: return a self-correcting hint instead of
+        # failing schema validation, which would burn the tool's retry budget
+        # and crash the whole run with UnexpectedModelBehavior.
+        if not result_a or not result_b:
+            return {
+                "error": "compare_backtest_results needs two backtest results.",
+                "hint": (
+                    "Call tool_run_backtest twice first, then pass both result "
+                    "dicts as result_a and result_b."
+                ),
+            }
         return compare_backtest_results(result_a, result_b)
 
     @agent.tool

@@ -20,6 +20,7 @@
 
 import structlog
 from pydantic_ai import Agent
+from pydantic_ai.exceptions import UnexpectedModelBehavior
 from pydantic_ai.messages import ModelMessage
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -261,6 +262,26 @@ async def chat(
             raise TimeoutError(
                 f"Agent response timed out after {self.settings.agent_timeout_seconds} seconds"
             ) from e
+        except UnexpectedModelBehavior as e:
+            # The model misbehaved (e.g. a tool call exceeded its retry budget).
+            # This is recoverable from the user's perspective — surface a clean
+            # message instead of leaking the raw PydanticAI exception string.
+            logger.warning(
+                "agents.chat_model_misbehavior",
+                session_id=session_id,
+                error=str(e),
+                error_type=type(e).__name__,
+            )
+            session.last_activity = datetime.now(UTC)
+            await db.flush()
+            return ChatResponse(
+                session_id=session_id,
+                message=(
+                    "I couldn't complete that request — the model produced an "
+                    "invalid tool call. Please try rephrasing, or give me a "
+                    "specific forecasting objective to work on."
+                ),
+            )
 
         # Extract tool calls from result
         tool_calls: list[ToolCallResult] = []
@@ -561,6 +582,29 @@ async def stream_chat(
             raise TimeoutError(
                 f"Agent response timed out after {self.settings.agent_timeout_seconds} seconds"
             ) from e
+        except UnexpectedModelBehavior as e:
+            # The model misbehaved (e.g. a tool call exceeded its retry budget).
+            # Emit a clean, recoverable `error` event rather than letting the raw
+            # PydanticAI exception bubble to the WebSocket handler.
+            logger.warning(
+                "agents.stream_chat_model_misbehavior",
+                session_id=session_id,
+                error=str(e),
+                error_type=type(e).__name__,
+            )
+            yield StreamEvent(
+                event_type="error",
+                data={
+                    "error": (
+                        "The assistant produced an invalid tool call and couldn't "
+                        "complete the request. Please try rephrasing your message."
+                    ),
+                    "error_type": "model_behavior_error",
+                    "recoverable": True,
+                },
+                timestamp=datetime.now(UTC),
+            )
+            return
 
         logger.info(
             "agents.stream_chat_completed",

@@ -5,6 +5,7 @@
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
+from pydantic_ai.exceptions import UnexpectedModelBehavior
 
 from app.features.agents.deps import AgentDeps
 from app.features.agents.models import AgentSession, AgentType, SessionStatus
@@ -287,6 +288,95 @@ async def test_chat_success(
         assert response.tokens_used == 100
         mock_agent.run.assert_called_once()
 
+    @pytest.mark.asyncio
+    async def test_chat_model_misbehavior_returns_friendly_message(
+        self,
+        sample_active_session: AgentSession,
+    ) -> None:
+        """A misbehaving model should yield a clean message, not crash.
+
+        Regression for issue #164: a tool call exceeding its retry budget
+        raised PydanticAI's `UnexpectedModelBehavior`, whose raw string
+        ("Tool '...' exceeded max retries count of 1") leaked to the user.
+        """
+        service = AgentService()
+        mock_db = AsyncMock()
+
+        mock_result = MagicMock()
+        mock_result.scalar_one_or_none.return_value = sample_active_session
+        mock_db.execute.return_value = mock_result
+
+        mock_agent = MagicMock()
+        mock_agent.run = AsyncMock(
+            side_effect=UnexpectedModelBehavior(
+                "Tool 'tool_compare_backtest_results' exceeded max retries count of 1"
+            )
+        )
+
+        with patch.object(service, "_get_agent", return_value=mock_agent):
+            response = await service.chat(
+                db=mock_db,
+                session_id=sample_active_session.session_id,
+                message="Hello",
+            )
+
+        assert response.session_id == sample_active_session.session_id
+        assert response.pending_approval is False
+        assert "invalid tool call" in response.message
+        assert "exceeded max retries" not in response.message
+
+
+class TestAgentServiceStreamChat:
+    """Tests for streaming chat functionality."""
+
+    @pytest.mark.asyncio
+    async def test_stream_chat_model_misbehavior_yields_error_event(
+        self,
+        sample_active_session: AgentSession,
+    ) -> None:
+        """A misbehaving model should yield a recoverable `error` event, not crash.
+
+        Regression for issue #164: `UnexpectedModelBehavior` raised inside
+        `agent.run_stream` bubbled to the WebSocket handler, which echoed the
+        raw exception string to the client.
+        """
+        service = AgentService()
+        mock_db = AsyncMock()
+
+        mock_result = MagicMock()
+        mock_result.scalar_one_or_none.return_value = sample_active_session
+        mock_db.execute.return_value = mock_result
+
+        class _RaisingStream:
+            """Async context manager that fails on entry like a misbehaving run."""
+
+            async def __aenter__(self) -> Any:
+                raise UnexpectedModelBehavior(
+                    "Tool 'tool_compare_backtest_results' exceeded max retries count of 1"
+                )
+
+            async def __aexit__(self, *exc: object) -> bool:
+                return False
+
+        mock_agent = MagicMock()
+        mock_agent.run_stream = MagicMock(return_value=_RaisingStream())
+
+        with patch.object(service, "_get_agent", return_value=mock_agent):
+            events = [
+                event
+                async for event in service.stream_chat(
+                    db=mock_db,
+                    session_id=sample_active_session.session_id,
+                    message="Hello",
+                )
+            ]
+
+        assert len(events) == 1
+        assert events[0].event_type == "error"
+        assert events[0].data["recoverable"] is True
+        assert events[0].data["error_type"] == "model_behavior_error"
+        assert "exceeded max retries" not in events[0].data["error"]
+
 
 class TestAgentServiceApproval:
     """Tests for approval workflow."""