Skip to content

Commit 835dfd7

Browse files
w7-learnclaude
andcommitted
fix(agents): improve validation, execution, and session handling
- .env.example: rename env vars to match Settings fields (AGENT_MAX_TOOL_CALLS, AGENT_REQUIRE_APPROVAL with JSON array format), update defaults to match config.py - config.py: validate model name is non-empty in model identifier - service.py: implement real action execution in approve_action instead of placeholder, add _execute_pending_action helper - backtesting_tools.py: fix docstring model types, add zero division guards in compare_backtest_results - forecasting_tools.py: fix docstring, add date range and horizon validation guards - registry_tools.py: add RunStatus validation before enum conversion - websocket.py: change to session-per-message pattern to prevent stale data and memory growth - docs/PHASE/9-AGENTIC_LAYER.md: update PR reference from #55 to #56 - README.md: update Agentic Layer config to match config.py Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 129de40 commit 835dfd7

10 files changed

Lines changed: 249 additions & 97 deletions

File tree

.env.example

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -78,17 +78,26 @@ ANTHROPIC_API_KEY=sk-ant-your-anthropic-api-key-here
7878
# Recommended: 4000 tokens for complex agent planning tasks
7979
# AGENT_THINKING_BUDGET=4000
8080

81-
# Session settings
82-
AGENT_SESSION_TTL_MINUTES=30
83-
AGENT_APPROVAL_TIMEOUT_MINUTES=5
84-
AGENT_MAX_TOOL_CALLS_PER_TURN=10
85-
8681
# Model parameters
82+
AGENT_TEMPERATURE=0.1
8783
AGENT_MAX_TOKENS=4096
88-
AGENT_TEMPERATURE=0.0
8984

90-
# Human-in-the-loop actions (comma-separated list)
91-
AGENT_APPROVAL_REQUIRED_ACTIONS=create_alias,archive_run
85+
# Execution settings
86+
AGENT_MAX_TOOL_CALLS=10
87+
AGENT_TIMEOUT_SECONDS=120
88+
AGENT_RETRY_ATTEMPTS=3
89+
AGENT_RETRY_DELAY_SECONDS=1.0
90+
91+
# Session settings
92+
AGENT_SESSION_TTL_MINUTES=120
93+
AGENT_MAX_SESSIONS_PER_USER=5
94+
95+
# Human-in-the-loop actions (JSON array format required for safe parsing)
96+
AGENT_REQUIRE_APPROVAL=["create_alias","archive_run"]
97+
AGENT_APPROVAL_TIMEOUT_MINUTES=60
98+
99+
# Streaming
100+
AGENT_ENABLE_STREAMING=true
92101

93102
# Frontend (Vite)
94103
VITE_API_BASE_URL=http://localhost:8123

README.md

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -560,15 +560,33 @@ curl -X POST http://localhost:8123/agents/sessions/{session_id}/chat \
560560
**Configuration:**
561561
```bash
562562
# Agent LLM Configuration
563-
ANTHROPIC_API_KEY=sk-ant-your-key
564-
AGENT_MODEL_NAME=claude-3-haiku-20240307
565-
AGENT_TEMPERATURE=0.0
563+
# Model format: "provider:model-name" (e.g., anthropic:claude-sonnet-4-5)
564+
AGENT_DEFAULT_MODEL=anthropic:claude-sonnet-4-5
565+
AGENT_FALLBACK_MODEL=openai:gpt-4o
566+
AGENT_TEMPERATURE=0.1
566567
AGENT_MAX_TOKENS=4096
567568

569+
# API Keys (set based on your chosen provider)
570+
ANTHROPIC_API_KEY=sk-ant-your-key
571+
# OPENAI_API_KEY=sk-your-key
572+
# GOOGLE_API_KEY=your-google-api-key # For Gemini models
573+
574+
# Execution Configuration
575+
AGENT_MAX_TOOL_CALLS=10
576+
AGENT_TIMEOUT_SECONDS=120
577+
AGENT_RETRY_ATTEMPTS=3
578+
AGENT_RETRY_DELAY_SECONDS=1.0
579+
568580
# Session Configuration
569-
AGENT_SESSION_TTL_MINUTES=30
570-
AGENT_APPROVAL_TIMEOUT_MINUTES=5
571-
AGENT_MAX_TOOL_CALLS_PER_TURN=10
581+
AGENT_SESSION_TTL_MINUTES=120
582+
AGENT_MAX_SESSIONS_PER_USER=5
583+
584+
# Human-in-the-loop Configuration (JSON array format)
585+
AGENT_REQUIRE_APPROVAL=["create_alias","archive_run"]
586+
AGENT_APPROVAL_TIMEOUT_MINUTES=60
587+
588+
# Streaming Configuration
589+
AGENT_ENABLE_STREAMING=true
572590
```
573591

574592
### Error Responses (RFC 7807)

app/core/config.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,15 +131,25 @@ def validate_model_identifier(cls, v: str) -> str:
131131
Validated model identifier.
132132
133133
Raises:
134-
ValueError: If format is invalid.
134+
ValueError: If format is invalid or model name is missing.
135135
"""
136136
if ":" not in v:
137137
raise ValueError(
138138
f"Invalid model identifier '{v}'. "
139139
"Expected format: 'provider:model-name' "
140140
"(e.g., 'anthropic:claude-sonnet-4-5', 'google-gla:gemini-3-flash')"
141141
)
142-
provider, _ = v.split(":", 1)
142+
provider, model_name = v.split(":", 1)
143+
144+
# Validate model name is non-empty and not just whitespace
145+
if not model_name or not model_name.strip():
146+
raise ValueError(
147+
f"Invalid model identifier '{v}'. "
148+
"Model name after ':' cannot be empty or blank. "
149+
"Expected format: 'provider:model-name' "
150+
"(e.g., 'anthropic:claude-sonnet-4-5', 'google-gla:gemini-3-flash')"
151+
)
152+
143153
valid_providers = ["anthropic", "openai", "google-gla", "google-vertex"]
144154
if provider not in valid_providers:
145155
raise ValueError(f"Unknown provider '{provider}'. Valid providers: {valid_providers}")

app/features/agents/service.py

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -466,15 +466,34 @@ async def approve_action(
466466
session.last_activity = datetime.now(UTC)
467467

468468
result: Any = None
469-
status: Literal["executed", "rejected", "expired"] = (
470-
"rejected" if not approved else "executed"
471-
)
469+
status: Literal["executed", "rejected", "expired"] = "rejected"
472470

473471
if approved:
474472
# Execute the pending action
475-
# Note: In production, we would re-run the tool here
476-
result = {"message": "Action approved and executed"}
477-
status = "executed"
473+
try:
474+
result = await self._execute_pending_action(
475+
db=db,
476+
action_type=pending.get("action_type", "unknown"),
477+
arguments=pending.get("arguments", {}),
478+
)
479+
status = "executed"
480+
logger.info(
481+
"agents.action_executed",
482+
session_id=session_id,
483+
action_id=action_id,
484+
action_type=pending.get("action_type"),
485+
)
486+
except Exception as e:
487+
logger.exception(
488+
"agents.action_execution_failed",
489+
session_id=session_id,
490+
action_id=action_id,
491+
action_type=pending.get("action_type"),
492+
error=str(e),
493+
error_type=type(e).__name__,
494+
)
495+
result = {"error": str(e), "error_type": type(e).__name__}
496+
status = "rejected" # Mark as rejected on failure
478497

479498
await db.flush()
480499

@@ -606,3 +625,45 @@ def _format_pending_action(
606625
created_at=datetime.fromisoformat(pending.get("created_at", "")),
607626
expires_at=datetime.fromisoformat(pending.get("expires_at", "")),
608627
)
628+
629+
async def _execute_pending_action(
630+
self,
631+
db: AsyncSession,
632+
action_type: str,
633+
arguments: dict[str, Any],
634+
) -> dict[str, Any]:
635+
"""Execute a pending action that was approved.
636+
637+
Args:
638+
db: Database session.
639+
action_type: Type of action to execute (e.g., 'create_alias', 'archive_run').
640+
arguments: Arguments for the action.
641+
642+
Returns:
643+
Result dictionary from the executed action.
644+
645+
Raises:
646+
ValueError: If action_type is not recognized.
647+
"""
648+
from app.features.agents.tools.registry_tools import archive_run, create_alias
649+
650+
if action_type == "create_alias":
651+
alias_name = arguments.get("alias_name", "")
652+
run_id = arguments.get("run_id", "")
653+
description = arguments.get("description")
654+
return await create_alias(
655+
db=db,
656+
alias_name=alias_name,
657+
run_id=run_id,
658+
description=description,
659+
)
660+
elif action_type == "archive_run":
661+
run_id = arguments.get("run_id", "")
662+
result = await archive_run(db=db, run_id=run_id)
663+
if result is None:
664+
raise ValueError(f"Run not found: {run_id}")
665+
return result
666+
else:
667+
raise ValueError(
668+
f"Unknown action type: {action_type}. Supported actions: create_alias, archive_run"
669+
)

app/features/agents/tests/test_service.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,11 @@ async def test_approve_action_approved(
362362
mock_result.scalar_one_or_none.return_value = sample_awaiting_approval_session
363363
mock_db.execute.return_value = mock_result
364364

365+
# Mock the _execute_pending_action method to return success
366+
service._execute_pending_action = AsyncMock( # type: ignore[method-assign]
367+
return_value={"message": "Alias created successfully", "alias_name": "production"}
368+
)
369+
365370
pending = sample_awaiting_approval_session.pending_action
366371
assert pending is not None
367372
action_id = pending["action_id"]
@@ -376,6 +381,12 @@ async def test_approve_action_approved(
376381
assert response.status == "executed"
377382
assert sample_awaiting_approval_session.pending_action is None
378383
assert sample_awaiting_approval_session.status == SessionStatus.ACTIVE.value
384+
# Verify _execute_pending_action was called with correct arguments
385+
service._execute_pending_action.assert_called_once_with(
386+
db=mock_db,
387+
action_type="create_alias",
388+
arguments={"alias_name": "production", "run_id": "abc123"},
389+
)
379390

380391
@pytest.mark.asyncio
381392
async def test_approve_action_rejected(

app/features/agents/tools/backtesting_tools.py

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,19 @@ def _create_model_config(
3838
"""Create model configuration from type string.
3939
4040
Args:
41-
model_type: Type of model ('naive', 'seasonal_naive', 'linear_regression').
42-
season_length: Season length for seasonal models (default 7 for weekly).
41+
model_type: Type of model. Supported values:
42+
- 'naive': Last observed value (simple baseline)
43+
- 'seasonal_naive': Same period from previous season
44+
- 'moving_average': Mean of last N observations
45+
season_length: Season length for seasonal_naive model (default 7 for weekly).
46+
Only used when model_type is 'seasonal_naive'.
4347
4448
Returns:
45-
Configured ModelConfig instance.
49+
Configured ModelConfig instance (NaiveModelConfig, SeasonalNaiveModelConfig,
50+
or MovingAverageModelConfig).
4651
4752
Raises:
48-
ValueError: If model_type is not supported.
53+
ValueError: If model_type is not one of: naive, seasonal_naive, moving_average.
4954
"""
5055
if model_type == "naive":
5156
return NaiveModelConfig()
@@ -248,17 +253,33 @@ def compare_backtest_results(
248253
mae_b = metrics_b.get("mae")
249254
if mae_a is not None and mae_b is not None:
250255
if mae_a < mae_b:
251-
pct_better = ((mae_b - mae_a) / mae_b) * 100
252-
comparison["recommendation"] = (
253-
f"Model A ({main_a.get('model_type')}) performs better with "
254-
f"{pct_better:.1f}% lower MAE ({mae_a:.2f} vs {mae_b:.2f})."
255-
)
256+
# Guard against division by zero
257+
if mae_b == 0:
258+
comparison["recommendation"] = (
259+
f"Model A ({main_a.get('model_type')}) performs better with "
260+
f"MAE {mae_a:.2f} vs {mae_b:.2f} (improvement is infinite/undetermined "
261+
f"as Model B has zero MAE baseline)."
262+
)
263+
else:
264+
pct_better = ((mae_b - mae_a) / mae_b) * 100
265+
comparison["recommendation"] = (
266+
f"Model A ({main_a.get('model_type')}) performs better with "
267+
f"{pct_better:.1f}% lower MAE ({mae_a:.2f} vs {mae_b:.2f})."
268+
)
256269
elif mae_b < mae_a:
257-
pct_better = ((mae_a - mae_b) / mae_a) * 100
258-
comparison["recommendation"] = (
259-
f"Model B ({main_b.get('model_type')}) performs better with "
260-
f"{pct_better:.1f}% lower MAE ({mae_b:.2f} vs {mae_a:.2f})."
261-
)
270+
# Guard against division by zero
271+
if mae_a == 0:
272+
comparison["recommendation"] = (
273+
f"Model B ({main_b.get('model_type')}) performs better with "
274+
f"MAE {mae_b:.2f} vs {mae_a:.2f} (improvement is infinite/undetermined "
275+
f"as Model A has zero MAE baseline)."
276+
)
277+
else:
278+
pct_better = ((mae_a - mae_b) / mae_a) * 100
279+
comparison["recommendation"] = (
280+
f"Model B ({main_b.get('model_type')}) performs better with "
281+
f"{pct_better:.1f}% lower MAE ({mae_b:.2f} vs {mae_a:.2f})."
282+
)
262283
else:
263284
comparison["recommendation"] = (
264285
f"Both models have identical MAE ({mae_a:.2f}). "

app/features/agents/tools/forecasting_tools.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import structlog
1616
from sqlalchemy.ext.asyncio import AsyncSession
1717

18+
from app.core.config import get_settings
1819
from app.features.forecasting.schemas import (
1920
ModelConfig,
2021
MovingAverageModelConfig,
@@ -35,14 +36,19 @@ def _create_model_config(
3536
"""Create model configuration from type string.
3637
3738
Args:
38-
model_type: Type of model ('naive', 'seasonal_naive', 'linear_regression').
39-
season_length: Season length for seasonal models (default 7 for weekly).
39+
model_type: Type of model. Supported values:
40+
- 'naive': Last observed value (simple baseline)
41+
- 'seasonal_naive': Same period from previous season
42+
- 'moving_average': Mean of last N observations
43+
season_length: Season length for seasonal_naive model (default 7 for weekly).
44+
Only used when model_type is 'seasonal_naive'.
4045
4146
Returns:
42-
Configured ModelConfig instance.
47+
Configured ModelConfig instance (NaiveModelConfig, SeasonalNaiveModelConfig,
48+
or MovingAverageModelConfig).
4349
4450
Raises:
45-
ValueError: If model_type is not supported.
51+
ValueError: If model_type is not one of: naive, seasonal_naive, moving_average.
4652
"""
4753
if model_type == "naive":
4854
return NaiveModelConfig()
@@ -104,6 +110,12 @@ async def train_model(
104110
model_type=model_type,
105111
)
106112

113+
# Validate date range
114+
if train_start_date > train_end_date:
115+
raise ValueError(
116+
f"train_start_date ({train_start_date}) must be <= train_end_date ({train_end_date})"
117+
)
118+
107119
# Create model configuration
108120
model_config = _create_model_config(model_type, season_length)
109121

@@ -168,6 +180,13 @@ async def predict(
168180
model_path=model_path,
169181
)
170182

183+
# Validate horizon against max limit
184+
settings = get_settings()
185+
if horizon > settings.forecast_max_horizon:
186+
raise ValueError(
187+
f"horizon ({horizon}) exceeds maximum allowed ({settings.forecast_max_horizon})"
188+
)
189+
171190
# Generate predictions
172191
service = ForecastingService()
173192
result: PredictResponse = await service.predict(

app/features/agents/tools/registry_tools.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,13 @@ async def list_runs(
7272

7373
service = RegistryService()
7474

75-
# Convert status string to enum if provided
76-
status_enum = RunStatus(status) if status else None
75+
# Convert status string to enum if provided, with validation
76+
status_enum: RunStatus | None = None
77+
if status:
78+
valid_statuses = [s.value for s in RunStatus]
79+
if status not in valid_statuses:
80+
raise ValueError(f"Invalid run status: '{status}'. Valid values: {valid_statuses}")
81+
status_enum = RunStatus(status)
7782

7883
result: RunListResponse = await service.list_runs(
7984
db=db,

0 commit comments

Comments
 (0)