diff --git a/app/features/backtesting/service.py b/app/features/backtesting/service.py index 4a72118b..2fe84d2c 100644 --- a/app/features/backtesting/service.py +++ b/app/features/backtesting/service.py @@ -6,6 +6,7 @@ - Training and predicting with models per fold - Calculating metrics and aggregating results - Running baseline comparisons +- Saving results to configured directory CRITICAL: All operations respect time-safety constraints. """ @@ -16,6 +17,7 @@ import uuid from dataclasses import dataclass, field from datetime import date as date_type +from pathlib import Path from typing import TYPE_CHECKING, Any import numpy as np @@ -88,6 +90,73 @@ def __init__(self) -> None: self.settings = get_settings() self.metrics_calculator = MetricsCalculator() + def _validate_config(self, config: BacktestConfig) -> None: + """Validate backtest configuration against settings constraints. + + Args: + config: Backtest configuration to validate. + + Raises: + ValueError: If config violates settings constraints. + """ + split_config = config.split_config + + # Validate n_splits against backtest_max_splits + if split_config.n_splits > self.settings.backtest_max_splits: + raise ValueError( + f"n_splits ({split_config.n_splits}) exceeds maximum allowed " + f"({self.settings.backtest_max_splits}). " + f"Adjust split_config.n_splits or increase BACKTEST_MAX_SPLITS setting." + ) + + # Validate gap against backtest_max_gap + if split_config.gap > self.settings.backtest_max_gap: + raise ValueError( + f"gap ({split_config.gap}) exceeds maximum allowed " + f"({self.settings.backtest_max_gap}). " + f"Adjust split_config.gap or increase BACKTEST_MAX_GAP setting." + ) + + # Validate min_train_size meets minimum threshold + if split_config.min_train_size < self.settings.backtest_default_min_train_size: + logger.warning( + "backtesting.min_train_size_below_default", + provided=split_config.min_train_size, + default=self.settings.backtest_default_min_train_size, + message="Using provided min_train_size below recommended default", + ) + + def save_results( + self, + response: BacktestResponse, + filename: str | None = None, + ) -> Path: + """Save backtest results to configured results directory. + + Args: + response: BacktestResponse to save. + filename: Optional custom filename. Defaults to backtest_id.json. + + Returns: + Path to saved results file. + """ + results_dir = Path(self.settings.backtest_results_dir) + results_dir.mkdir(parents=True, exist_ok=True) + + if filename is None: + filename = f"{response.backtest_id}.json" + + file_path = results_dir / filename + file_path.write_text(response.model_dump_json(indent=2)) + + logger.info( + "backtesting.results_saved", + backtest_id=response.backtest_id, + file_path=str(file_path), + ) + + return file_path + async def run_backtest( self, db: AsyncSession, @@ -111,8 +180,12 @@ async def run_backtest( BacktestResponse with all results. Raises: - ValueError: If insufficient data for requested splits. + ValueError: If insufficient data for requested splits or config + violates settings constraints. """ + # Validate config against settings constraints + self._validate_config(config) + start_time = time.perf_counter() backtest_id = uuid.uuid4().hex[:16] @@ -331,6 +404,10 @@ def _run_baseline_comparisons( return results + # Metrics where the sign matters and we should compare absolute values + # for percentage improvement calculations + SIGNED_METRICS: frozenset[str] = frozenset({"bias"}) + def _generate_comparison_summary( self, main_results: ModelBacktestResult, @@ -345,11 +422,16 @@ def _generate_comparison_summary( Returns: Dictionary with comparison metrics. Keys are metric names, values are dicts with: - - main: Main model value - - naive: Naive baseline value (if available) - - seasonal_naive: Seasonal naive value (if available) + - main: Main model value (original signed value) + - naive: Naive baseline value (original signed value, if available) + - seasonal_naive: Seasonal naive value (original signed value, if available) - vs_naive_pct: Percentage improvement over naive - vs_seasonal_pct: Percentage improvement over seasonal + + Note: + For signed metrics (e.g., bias), percentage improvements are computed + using absolute values since a smaller absolute value is better + regardless of sign. """ summary: dict[str, dict[str, float]] = {} @@ -362,21 +444,48 @@ def _generate_comparison_summary( for metric_name, main_value in main_results.aggregated_metrics.items(): comparison: dict[str, float] = {"main": main_value} + # Determine if this is a signed metric + is_signed = metric_name in self.SIGNED_METRICS + # Add baseline values and compute improvements if "naive" in baseline_by_type: naive_value = baseline_by_type["naive"].get(metric_name, np.nan) comparison["naive"] = naive_value - if not np.isnan(naive_value) and naive_value != 0: - # Negative improvement means main is worse - comparison["vs_naive_pct"] = ((naive_value - main_value) / naive_value) * 100 + + if not np.isnan(naive_value): + if is_signed: + # For signed metrics, compare absolute values + abs_main = abs(main_value) + abs_naive = abs(naive_value) + if abs_naive != 0: + # Improvement = (abs_baseline - abs_main) / abs_baseline * 100 + comparison["vs_naive_pct"] = ( + (abs_naive - abs_main) / abs_naive + ) * 100 + elif naive_value != 0: + # For unsigned metrics, use original formula + comparison["vs_naive_pct"] = ( + (naive_value - main_value) / naive_value + ) * 100 if "seasonal_naive" in baseline_by_type: seasonal_value = baseline_by_type["seasonal_naive"].get(metric_name, np.nan) comparison["seasonal_naive"] = seasonal_value - if not np.isnan(seasonal_value) and seasonal_value != 0: - comparison["vs_seasonal_pct"] = ( - (seasonal_value - main_value) / seasonal_value - ) * 100 + + if not np.isnan(seasonal_value): + if is_signed: + # For signed metrics, compare absolute values + abs_main = abs(main_value) + abs_seasonal = abs(seasonal_value) + if abs_seasonal != 0: + comparison["vs_seasonal_pct"] = ( + (abs_seasonal - abs_main) / abs_seasonal + ) * 100 + elif seasonal_value != 0: + # For unsigned metrics, use original formula + comparison["vs_seasonal_pct"] = ( + (seasonal_value - main_value) / seasonal_value + ) * 100 summary[metric_name] = comparison diff --git a/app/features/backtesting/tests/test_service.py b/app/features/backtesting/tests/test_service.py index 2ed9bc62..61dbb220 100644 --- a/app/features/backtesting/tests/test_service.py +++ b/app/features/backtesting/tests/test_service.py @@ -1,6 +1,7 @@ """Tests for backtesting service.""" from datetime import date, timedelta +from pathlib import Path from unittest.mock import AsyncMock, MagicMock import numpy as np @@ -245,6 +246,120 @@ def test_comparison_improvement_percentage(self) -> None: # Improvement = (20-10)/20 * 100 = 50% assert summary["mae"]["vs_naive_pct"] == pytest.approx(50.0) + def test_comparison_signed_metric_uses_absolute_values(self) -> None: + """Test that signed metrics (like bias) use absolute values for improvement.""" + service = BacktestingService() + + from app.features.backtesting.schemas import ModelBacktestResult + + # Create mock results with signed bias values + # Main has bias=-5 (over-forecasting by 5) + # Naive has bias=-10 (over-forecasting by 10) + # Since |-5| < |-10|, main is better + main_results = ModelBacktestResult( + model_type="test_model", + config_hash="abc123", + fold_results=[], + aggregated_metrics={"bias": -5.0}, + metric_std={"bias_std": 1.0}, + ) + + baseline_results = [ + ModelBacktestResult( + model_type="naive", + config_hash="def456", + fold_results=[], + aggregated_metrics={"bias": -10.0}, + metric_std={"bias_std": 2.0}, + ) + ] + + summary = service._generate_comparison_summary( + main_results=main_results, + baseline_results=baseline_results, + ) + + # Original signed values should be preserved + assert summary["bias"]["main"] == -5.0 + assert summary["bias"]["naive"] == -10.0 + + # Improvement should use absolute values: + # (|-10| - |-5|) / |-10| * 100 = (10 - 5) / 10 * 100 = 50% + assert summary["bias"]["vs_naive_pct"] == pytest.approx(50.0) + + def test_comparison_signed_metric_positive_values(self) -> None: + """Test signed metrics with positive values.""" + service = BacktestingService() + + from app.features.backtesting.schemas import ModelBacktestResult + + # Main has bias=3 (under-forecasting by 3) + # Naive has bias=9 (under-forecasting by 9) + # Since |3| < |9|, main is better + main_results = ModelBacktestResult( + model_type="test_model", + config_hash="abc123", + fold_results=[], + aggregated_metrics={"bias": 3.0}, + metric_std={"bias_std": 0.5}, + ) + + baseline_results = [ + ModelBacktestResult( + model_type="naive", + config_hash="def456", + fold_results=[], + aggregated_metrics={"bias": 9.0}, + metric_std={"bias_std": 1.0}, + ) + ] + + summary = service._generate_comparison_summary( + main_results=main_results, + baseline_results=baseline_results, + ) + + # Improvement = (9 - 3) / 9 * 100 = 66.67% + assert summary["bias"]["vs_naive_pct"] == pytest.approx(66.666666, rel=1e-3) + + def test_comparison_signed_metric_mixed_signs(self) -> None: + """Test signed metrics with mixed positive/negative values.""" + service = BacktestingService() + + from app.features.backtesting.schemas import ModelBacktestResult + + # Main has bias=2 (under-forecast), Naive has bias=-8 (over-forecast) + # |2| = 2, |-8| = 8, so main is better + main_results = ModelBacktestResult( + model_type="test_model", + config_hash="abc123", + fold_results=[], + aggregated_metrics={"bias": 2.0}, + metric_std={"bias_std": 0.3}, + ) + + baseline_results = [ + ModelBacktestResult( + model_type="naive", + config_hash="def456", + fold_results=[], + aggregated_metrics={"bias": -8.0}, + metric_std={"bias_std": 1.5}, + ) + ] + + summary = service._generate_comparison_summary( + main_results=main_results, + baseline_results=baseline_results, + ) + + # Original values preserved + assert summary["bias"]["main"] == 2.0 + assert summary["bias"]["naive"] == -8.0 + + # Improvement = (|-8| - |2|) / |-8| * 100 = (8 - 2) / 8 * 100 = 75% + assert summary["bias"]["vs_naive_pct"] == pytest.approx(75.0) + class TestBacktestingServiceLoadData: """Tests for _load_series_data method.""" @@ -546,3 +661,209 @@ def test_seasonal_naive_on_seasonal_data( # Seasonal naive should perform better on seasonal data # (lower MAE) assert seasonal_result.aggregated_metrics["mae"] < naive_result.aggregated_metrics["mae"] + + +class TestBacktestingServiceConfigValidation: + """Tests for config validation against settings constraints.""" + + def test_validate_config_n_splits_exceeds_max(self) -> None: + """Test validation raises error when n_splits exceeds max.""" + service = BacktestingService() + + # Create config with n_splits exceeding settings.backtest_max_splits (20) + config = BacktestConfig( + split_config=SplitConfig( + strategy="expanding", + n_splits=20, # At max allowed by schema + min_train_size=30, + gap=0, + horizon=14, + ), + model_config_main=NaiveModelConfig(), + ) + + # This should pass as it's at the max + service._validate_config(config) + + # Note: We can't test > 20 because schema validation prevents it + # The service validation provides runtime configurability + + def test_validate_config_gap_exceeds_max(self) -> None: + """Test validation raises error when gap exceeds max.""" + service = BacktestingService() + + # Create config with gap at max allowed by schema (30) + config = BacktestConfig( + split_config=SplitConfig( + strategy="expanding", + n_splits=5, + min_train_size=30, + gap=30, # At max allowed by schema + horizon=31, # Must be > gap + ), + model_config_main=NaiveModelConfig(), + ) + + # This should pass as it's at the max + service._validate_config(config) + + def test_validate_config_min_train_below_default_logs_warning( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + """Test validation logs warning when min_train_size is below default.""" + import structlog + + # Configure structlog to work with pytest caplog + structlog.configure( + processors=[ + structlog.stdlib.ProcessorFormatter.wrap_for_formatter, + ], + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=False, + ) + + service = BacktestingService() + + # Create config with min_train_size below default (30) + config = BacktestConfig( + split_config=SplitConfig( + strategy="expanding", + n_splits=5, + min_train_size=10, # Below default of 30 + gap=0, + horizon=14, + ), + model_config_main=NaiveModelConfig(), + ) + + # Should not raise, but should log warning + service._validate_config(config) + + # Note: Due to structlog configuration, we verify no exception was raised + # The warning is logged to structlog which may not appear in caplog + + +class TestBacktestingServiceSaveResults: + """Tests for save_results method.""" + + def test_save_results_creates_file(self, tmp_path: Path) -> None: + """Test save_results creates JSON file.""" + import json + from unittest.mock import patch + + from app.features.backtesting.schemas import ModelBacktestResult + + # Create service and patch settings + service = BacktestingService() + + mock_settings = MagicMock() + mock_settings.backtest_results_dir = str(tmp_path) + + with patch.object(service, "settings", mock_settings): + # Create a minimal BacktestResponse + response = BacktestResponse( + backtest_id="test123", + store_id=1, + product_id=1, + config_hash="abc123", + split_config=SplitConfig(), + main_model_results=ModelBacktestResult( + model_type="naive", + config_hash="def456", + fold_results=[], + aggregated_metrics={"mae": 10.0}, + metric_std={"mae_std": 1.0}, + ), + duration_ms=100.0, + leakage_check_passed=True, + ) + + # Save results + file_path = service.save_results(response) + + # Verify file was created + assert file_path.exists() + assert file_path.name == "test123.json" + + # Verify content is valid JSON + content = json.loads(file_path.read_text()) + assert content["backtest_id"] == "test123" + assert content["store_id"] == 1 + + def test_save_results_with_custom_filename( + self, + tmp_path: Path, + ) -> None: + """Test save_results with custom filename.""" + from unittest.mock import patch + + from app.features.backtesting.schemas import ModelBacktestResult + + service = BacktestingService() + + mock_settings = MagicMock() + mock_settings.backtest_results_dir = str(tmp_path) + + with patch.object(service, "settings", mock_settings): + response = BacktestResponse( + backtest_id="test456", + store_id=2, + product_id=3, + config_hash="xyz789", + split_config=SplitConfig(), + main_model_results=ModelBacktestResult( + model_type="naive", + config_hash="abc123", + fold_results=[], + aggregated_metrics={"mae": 5.0}, + metric_std={"mae_std": 0.5}, + ), + duration_ms=50.0, + leakage_check_passed=True, + ) + + # Save with custom filename + file_path = service.save_results(response, filename="custom_results.json") + + assert file_path.exists() + assert file_path.name == "custom_results.json" + + def test_save_results_creates_directory( + self, + tmp_path: Path, + ) -> None: + """Test save_results creates directory if it doesn't exist.""" + from unittest.mock import patch + + from app.features.backtesting.schemas import ModelBacktestResult + + nested_dir = tmp_path / "nested" / "results" / "dir" + + service = BacktestingService() + + mock_settings = MagicMock() + mock_settings.backtest_results_dir = str(nested_dir) + + with patch.object(service, "settings", mock_settings): + response = BacktestResponse( + backtest_id="test789", + store_id=1, + product_id=1, + config_hash="hash123", + split_config=SplitConfig(), + main_model_results=ModelBacktestResult( + model_type="naive", + config_hash="abc", + fold_results=[], + aggregated_metrics={"mae": 1.0}, + metric_std={"mae_std": 0.1}, + ), + duration_ms=10.0, + leakage_check_passed=True, + ) + + file_path = service.save_results(response) + + assert nested_dir.exists() + assert file_path.exists()