fix(agents): improve validation, execution, and session handling

w7-learn · claude · w7-learn · commit 835dfd71e7aa · 2026-02-01T21:30:39.000Z
- .env.example: rename env vars to match Settings fields (AGENT_MAX_TOOL_CALLS, AGENT_REQUIRE_APPROVAL with JSON array format), update defaults to match config.py
- config.py: validate model name is non-empty in model identifier
- service.py: implement real action execution in approve_action instead of placeholder, add _execute_pending_action helper
- backtesting_tools.py: fix docstring model types, add zero division guards in compare_backtest_results
- forecasting_tools.py: fix docstring, add date range and horizon validation guards
- registry_tools.py: add RunStatus validation before enum conversion
- websocket.py: change to session-per-message pattern to prevent stale data and memory growth
- docs/PHASE/9-AGENTIC_LAYER.md: update PR reference from #55 to #56
- README.md: update Agentic Layer config to match config.py

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/.env.example b/.env.example
@@ -78,17 +78,26 @@ ANTHROPIC_API_KEY=sk-ant-your-anthropic-api-key-here
 # Recommended: 4000 tokens for complex agent planning tasks
 # AGENT_THINKING_BUDGET=4000
 
-# Session settings
-AGENT_SESSION_TTL_MINUTES=30
-AGENT_APPROVAL_TIMEOUT_MINUTES=5
-AGENT_MAX_TOOL_CALLS_PER_TURN=10
-
 # Model parameters
+AGENT_TEMPERATURE=0.1
 AGENT_MAX_TOKENS=4096
-AGENT_TEMPERATURE=0.0
 
-# Human-in-the-loop actions (comma-separated list)
-AGENT_APPROVAL_REQUIRED_ACTIONS=create_alias,archive_run
+# Execution settings
+AGENT_MAX_TOOL_CALLS=10
+AGENT_TIMEOUT_SECONDS=120
+AGENT_RETRY_ATTEMPTS=3
+AGENT_RETRY_DELAY_SECONDS=1.0
+
+# Session settings
+AGENT_SESSION_TTL_MINUTES=120
+AGENT_MAX_SESSIONS_PER_USER=5
+
+# Human-in-the-loop actions (JSON array format required for safe parsing)
+AGENT_REQUIRE_APPROVAL=["create_alias","archive_run"]
+AGENT_APPROVAL_TIMEOUT_MINUTES=60
+
+# Streaming
+AGENT_ENABLE_STREAMING=true
 
 # Frontend (Vite)
 VITE_API_BASE_URL=http://localhost:8123
diff --git a/README.md b/README.md
@@ -560,15 +560,33 @@ curl -X POST http://localhost:8123/agents/sessions/{session_id}/chat \
 **Configuration:**
 ```bash
 # Agent LLM Configuration
-ANTHROPIC_API_KEY=sk-ant-your-key
-AGENT_MODEL_NAME=claude-3-haiku-20240307
-AGENT_TEMPERATURE=0.0
+# Model format: "provider:model-name" (e.g., anthropic:claude-sonnet-4-5)
+AGENT_DEFAULT_MODEL=anthropic:claude-sonnet-4-5
+AGENT_FALLBACK_MODEL=openai:gpt-4o
+AGENT_TEMPERATURE=0.1
 AGENT_MAX_TOKENS=4096
 
+# API Keys (set based on your chosen provider)
+ANTHROPIC_API_KEY=sk-ant-your-key
+# OPENAI_API_KEY=sk-your-key
+# GOOGLE_API_KEY=your-google-api-key  # For Gemini models
+
+# Execution Configuration
+AGENT_MAX_TOOL_CALLS=10
+AGENT_TIMEOUT_SECONDS=120
+AGENT_RETRY_ATTEMPTS=3
+AGENT_RETRY_DELAY_SECONDS=1.0
+
 # Session Configuration
-AGENT_SESSION_TTL_MINUTES=30
-AGENT_APPROVAL_TIMEOUT_MINUTES=5
-AGENT_MAX_TOOL_CALLS_PER_TURN=10
+AGENT_SESSION_TTL_MINUTES=120
+AGENT_MAX_SESSIONS_PER_USER=5
+
+# Human-in-the-loop Configuration (JSON array format)
+AGENT_REQUIRE_APPROVAL=["create_alias","archive_run"]
+AGENT_APPROVAL_TIMEOUT_MINUTES=60
+
+# Streaming Configuration
+AGENT_ENABLE_STREAMING=true
 ```
 
 ### Error Responses (RFC 7807)
diff --git a/app/core/config.py b/app/core/config.py
@@ -131,15 +131,25 @@ def validate_model_identifier(cls, v: str) -> str:
             Validated model identifier.
 
         Raises:
-            ValueError: If format is invalid.
+            ValueError: If format is invalid or model name is missing.
         """
         if ":" not in v:
             raise ValueError(
                 f"Invalid model identifier '{v}'. "
                 "Expected format: 'provider:model-name' "
                 "(e.g., 'anthropic:claude-sonnet-4-5', 'google-gla:gemini-3-flash')"
             )
-        provider, _ = v.split(":", 1)
+        provider, model_name = v.split(":", 1)
+
+        # Validate model name is non-empty and not just whitespace
+        if not model_name or not model_name.strip():
+            raise ValueError(
+                f"Invalid model identifier '{v}'. "
+                "Model name after ':' cannot be empty or blank. "
+                "Expected format: 'provider:model-name' "
+                "(e.g., 'anthropic:claude-sonnet-4-5', 'google-gla:gemini-3-flash')"
+            )
+
         valid_providers = ["anthropic", "openai", "google-gla", "google-vertex"]
         if provider not in valid_providers:
             raise ValueError(f"Unknown provider '{provider}'. Valid providers: {valid_providers}")
diff --git a/app/features/agents/service.py b/app/features/agents/service.py
@@ -466,15 +466,34 @@ async def approve_action(
         session.last_activity = datetime.now(UTC)
 
         result: Any = None
-        status: Literal["executed", "rejected", "expired"] = (
-            "rejected" if not approved else "executed"
-        )
+        status: Literal["executed", "rejected", "expired"] = "rejected"
 
         if approved:
             # Execute the pending action
-            # Note: In production, we would re-run the tool here
-            result = {"message": "Action approved and executed"}
-            status = "executed"
+            try:
+                result = await self._execute_pending_action(
+                    db=db,
+                    action_type=pending.get("action_type", "unknown"),
+                    arguments=pending.get("arguments", {}),
+                )
+                status = "executed"
+                logger.info(
+                    "agents.action_executed",
+                    session_id=session_id,
+                    action_id=action_id,
+                    action_type=pending.get("action_type"),
+                )
+            except Exception as e:
+                logger.exception(
+                    "agents.action_execution_failed",
+                    session_id=session_id,
+                    action_id=action_id,
+                    action_type=pending.get("action_type"),
+                    error=str(e),
+                    error_type=type(e).__name__,
+                )
+                result = {"error": str(e), "error_type": type(e).__name__}
+                status = "rejected"  # Mark as rejected on failure
 
         await db.flush()
 
@@ -606,3 +625,45 @@ def _format_pending_action(
             created_at=datetime.fromisoformat(pending.get("created_at", "")),
             expires_at=datetime.fromisoformat(pending.get("expires_at", "")),
         )
+
+    async def _execute_pending_action(
+        self,
+        db: AsyncSession,
+        action_type: str,
+        arguments: dict[str, Any],
+    ) -> dict[str, Any]:
+        """Execute a pending action that was approved.
+
+        Args:
+            db: Database session.
+            action_type: Type of action to execute (e.g., 'create_alias', 'archive_run').
+            arguments: Arguments for the action.
+
+        Returns:
+            Result dictionary from the executed action.
+
+        Raises:
+            ValueError: If action_type is not recognized.
+        """
+        from app.features.agents.tools.registry_tools import archive_run, create_alias
+
+        if action_type == "create_alias":
+            alias_name = arguments.get("alias_name", "")
+            run_id = arguments.get("run_id", "")
+            description = arguments.get("description")
+            return await create_alias(
+                db=db,
+                alias_name=alias_name,
+                run_id=run_id,
+                description=description,
+            )
+        elif action_type == "archive_run":
+            run_id = arguments.get("run_id", "")
+            result = await archive_run(db=db, run_id=run_id)
+            if result is None:
+                raise ValueError(f"Run not found: {run_id}")
+            return result
+        else:
+            raise ValueError(
+                f"Unknown action type: {action_type}. Supported actions: create_alias, archive_run"
+            )
diff --git a/app/features/agents/tests/test_service.py b/app/features/agents/tests/test_service.py
@@ -362,6 +362,11 @@ async def test_approve_action_approved(
         mock_result.scalar_one_or_none.return_value = sample_awaiting_approval_session
         mock_db.execute.return_value = mock_result
 
+        # Mock the _execute_pending_action method to return success
+        service._execute_pending_action = AsyncMock(  # type: ignore[method-assign]
+            return_value={"message": "Alias created successfully", "alias_name": "production"}
+        )
+
         pending = sample_awaiting_approval_session.pending_action
         assert pending is not None
         action_id = pending["action_id"]
@@ -376,6 +381,12 @@ async def test_approve_action_approved(
         assert response.status == "executed"
         assert sample_awaiting_approval_session.pending_action is None
         assert sample_awaiting_approval_session.status == SessionStatus.ACTIVE.value
+        # Verify _execute_pending_action was called with correct arguments
+        service._execute_pending_action.assert_called_once_with(
+            db=mock_db,
+            action_type="create_alias",
+            arguments={"alias_name": "production", "run_id": "abc123"},
+        )
 
     @pytest.mark.asyncio
     async def test_approve_action_rejected(
diff --git a/app/features/agents/tools/backtesting_tools.py b/app/features/agents/tools/backtesting_tools.py
@@ -38,14 +38,19 @@ def _create_model_config(
     """Create model configuration from type string.
 
     Args:
-        model_type: Type of model ('naive', 'seasonal_naive', 'linear_regression').
-        season_length: Season length for seasonal models (default 7 for weekly).
+        model_type: Type of model. Supported values:
+            - 'naive': Last observed value (simple baseline)
+            - 'seasonal_naive': Same period from previous season
+            - 'moving_average': Mean of last N observations
+        season_length: Season length for seasonal_naive model (default 7 for weekly).
+            Only used when model_type is 'seasonal_naive'.
 
     Returns:
-        Configured ModelConfig instance.
+        Configured ModelConfig instance (NaiveModelConfig, SeasonalNaiveModelConfig,
+        or MovingAverageModelConfig).
 
     Raises:
-        ValueError: If model_type is not supported.
+        ValueError: If model_type is not one of: naive, seasonal_naive, moving_average.
     """
     if model_type == "naive":
         return NaiveModelConfig()
@@ -248,17 +253,33 @@ def compare_backtest_results(
     mae_b = metrics_b.get("mae")
     if mae_a is not None and mae_b is not None:
         if mae_a < mae_b:
-            pct_better = ((mae_b - mae_a) / mae_b) * 100
-            comparison["recommendation"] = (
-                f"Model A ({main_a.get('model_type')}) performs better with "
-                f"{pct_better:.1f}% lower MAE ({mae_a:.2f} vs {mae_b:.2f})."
-            )
+            # Guard against division by zero
+            if mae_b == 0:
+                comparison["recommendation"] = (
+                    f"Model A ({main_a.get('model_type')}) performs better with "
+                    f"MAE {mae_a:.2f} vs {mae_b:.2f} (improvement is infinite/undetermined "
+                    f"as Model B has zero MAE baseline)."
+                )
+            else:
+                pct_better = ((mae_b - mae_a) / mae_b) * 100
+                comparison["recommendation"] = (
+                    f"Model A ({main_a.get('model_type')}) performs better with "
+                    f"{pct_better:.1f}% lower MAE ({mae_a:.2f} vs {mae_b:.2f})."
+                )
         elif mae_b < mae_a:
-            pct_better = ((mae_a - mae_b) / mae_a) * 100
-            comparison["recommendation"] = (
-                f"Model B ({main_b.get('model_type')}) performs better with "
-                f"{pct_better:.1f}% lower MAE ({mae_b:.2f} vs {mae_a:.2f})."
-            )
+            # Guard against division by zero
+            if mae_a == 0:
+                comparison["recommendation"] = (
+                    f"Model B ({main_b.get('model_type')}) performs better with "
+                    f"MAE {mae_b:.2f} vs {mae_a:.2f} (improvement is infinite/undetermined "
+                    f"as Model A has zero MAE baseline)."
+                )
+            else:
+                pct_better = ((mae_a - mae_b) / mae_a) * 100
+                comparison["recommendation"] = (
+                    f"Model B ({main_b.get('model_type')}) performs better with "
+                    f"{pct_better:.1f}% lower MAE ({mae_b:.2f} vs {mae_a:.2f})."
+                )
         else:
             comparison["recommendation"] = (
                 f"Both models have identical MAE ({mae_a:.2f}). "
diff --git a/app/features/agents/tools/forecasting_tools.py b/app/features/agents/tools/forecasting_tools.py
@@ -15,6 +15,7 @@
 import structlog
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.core.config import get_settings
 from app.features.forecasting.schemas import (
     ModelConfig,
     MovingAverageModelConfig,
@@ -35,14 +36,19 @@ def _create_model_config(
     """Create model configuration from type string.
 
     Args:
-        model_type: Type of model ('naive', 'seasonal_naive', 'linear_regression').
-        season_length: Season length for seasonal models (default 7 for weekly).
+        model_type: Type of model. Supported values:
+            - 'naive': Last observed value (simple baseline)
+            - 'seasonal_naive': Same period from previous season
+            - 'moving_average': Mean of last N observations
+        season_length: Season length for seasonal_naive model (default 7 for weekly).
+            Only used when model_type is 'seasonal_naive'.
 
     Returns:
-        Configured ModelConfig instance.
+        Configured ModelConfig instance (NaiveModelConfig, SeasonalNaiveModelConfig,
+        or MovingAverageModelConfig).
 
     Raises:
-        ValueError: If model_type is not supported.
+        ValueError: If model_type is not one of: naive, seasonal_naive, moving_average.
     """
     if model_type == "naive":
         return NaiveModelConfig()
@@ -104,6 +110,12 @@ async def train_model(
         model_type=model_type,
     )
 
+    # Validate date range
+    if train_start_date > train_end_date:
+        raise ValueError(
+            f"train_start_date ({train_start_date}) must be <= train_end_date ({train_end_date})"
+        )
+
     # Create model configuration
     model_config = _create_model_config(model_type, season_length)
 
@@ -168,6 +180,13 @@ async def predict(
         model_path=model_path,
     )
 
+    # Validate horizon against max limit
+    settings = get_settings()
+    if horizon > settings.forecast_max_horizon:
+        raise ValueError(
+            f"horizon ({horizon}) exceeds maximum allowed ({settings.forecast_max_horizon})"
+        )
+
     # Generate predictions
     service = ForecastingService()
     result: PredictResponse = await service.predict(
diff --git a/app/features/agents/tools/registry_tools.py b/app/features/agents/tools/registry_tools.py
@@ -72,8 +72,13 @@ async def list_runs(
 
     service = RegistryService()
 
-    # Convert status string to enum if provided
-    status_enum = RunStatus(status) if status else None
+    # Convert status string to enum if provided, with validation
+    status_enum: RunStatus | None = None
+    if status:
+        valid_statuses = [s.value for s in RunStatus]
+        if status not in valid_statuses:
+            raise ValueError(f"Invalid run status: '{status}'. Valid values: {valid_statuses}")
+        status_enum = RunStatus(status)
 
     result: RunListResponse = await service.list_runs(
         db=db,
diff --git a/app/features/agents/websocket.py b/app/features/agents/websocket.py
diff --git a/docs/PHASE/9-AGENTIC_LAYER.md b/docs/PHASE/9-AGENTIC_LAYER.md