-
Notifications
You must be signed in to change notification settings - Fork 0
fix(jobs): backtest job result keeps fold metrics, stability and baselines (#148) #152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,160 @@ | ||
| """Unit tests for the jobs service result-shaping logic. | ||
|
|
||
| These are pure-function tests (no DB) for the helpers that flatten a | ||
| ``BacktestResponse`` into the job-result contract the dashboard reads. | ||
| Regression coverage for issue #148 — the backtest job result used to drop | ||
| per-fold metrics, stability and the baseline comparison. | ||
| """ | ||
|
|
||
| import math | ||
| from datetime import date | ||
|
|
||
| from app.features.backtesting.schemas import ( | ||
| BacktestResponse, | ||
| FoldResult, | ||
| ModelBacktestResult, | ||
| SplitBoundary, | ||
| SplitConfig, | ||
| ) | ||
| from app.features.jobs.service import _finite, _shape_backtest_result | ||
|
|
||
|
|
||
| def _fold(idx: int, mae: float, smape: float, wape: float, bias: float) -> FoldResult: | ||
| """Build a FoldResult with the given per-fold metrics.""" | ||
| return FoldResult( | ||
| fold_index=idx, | ||
| split=SplitBoundary( | ||
| fold_index=idx, | ||
| train_start=date(2024, 1, 1), | ||
| train_end=date(2024, 2, 1), | ||
| test_start=date(2024, 2, 2), | ||
| test_end=date(2024, 2, 15), | ||
| train_size=32, | ||
| test_size=14, | ||
| ), | ||
| dates=[date(2024, 2, 2)], | ||
| actuals=[10.0], | ||
| predictions=[11.0], | ||
| metrics={"mae": mae, "smape": smape, "wape": wape, "bias": bias}, | ||
| ) | ||
|
|
||
|
|
||
| def _make_response(*, with_baselines: bool = True, nan_stability: bool = False) -> BacktestResponse: | ||
| """Build a minimal BacktestResponse for shaping tests.""" | ||
| folds = [ | ||
| _fold(0, 10.0, 12.0, 11.0, 1.0), | ||
| _fold(1, 14.0, 11.0, 12.0, 3.0), | ||
| ] | ||
| stability = { | ||
| "wape_stability": math.nan if nan_stability else 8.5, | ||
| "mae_stability": 5.0, | ||
| } | ||
| main = ModelBacktestResult( | ||
| model_type="seasonal_naive", | ||
| config_hash="abc123", | ||
| fold_results=folds, | ||
| aggregated_metrics={"mae": 12.0, "smape": 11.5, "wape": 11.5, "bias": 2.0}, | ||
| metric_std=stability, | ||
| ) | ||
| baselines: list[ModelBacktestResult] | None = None | ||
| comparison: dict[str, dict[str, float]] | None = None | ||
| if with_baselines: | ||
| baselines = [ | ||
| ModelBacktestResult( | ||
| model_type="naive", | ||
| config_hash="n1", | ||
| fold_results=folds, | ||
| aggregated_metrics={"mae": 15.0, "smape": 14.0, "wape": 14.0, "bias": 3.0}, | ||
| metric_std={}, | ||
| ), | ||
| ] | ||
| comparison = { | ||
| "mae": { | ||
| "main": 12.0, | ||
| "naive": 15.0, | ||
| "vs_naive_pct": 20.0, | ||
| "seasonal_naive": 13.0, | ||
| "vs_seasonal_pct": 7.7, | ||
| }, | ||
| } | ||
| return BacktestResponse( | ||
| backtest_id="bt-1", | ||
| store_id=1, | ||
| product_id=1, | ||
| config_hash="cfg", | ||
| split_config=SplitConfig(n_splits=2), | ||
| main_model_results=main, | ||
| baseline_results=baselines, | ||
| comparison_summary=comparison, | ||
| duration_ms=3.5, | ||
| leakage_check_passed=True, | ||
| ) | ||
|
|
||
|
|
||
| def test_shape_backtest_result_includes_fold_metrics() -> None: | ||
| """fold_metrics is populated, one entry per fold, with 1-based fold numbers.""" | ||
| response = _make_response() | ||
| result = _shape_backtest_result(response, "seasonal_naive") | ||
|
|
||
| # top-level fields are passed through unchanged | ||
| assert result["backtest_id"] == response.backtest_id | ||
| assert result["model_type"] == "seasonal_naive" | ||
| assert result["duration_ms"] == response.duration_ms | ||
|
|
||
| # per-fold metrics are present and correctly shaped | ||
| assert result["n_splits"] == 2 | ||
| assert [f["fold"] for f in result["fold_metrics"]] == [1, 2] | ||
| assert result["fold_metrics"][0]["mae"] == 10.0 | ||
| assert result["fold_metrics"][1]["bias"] == 3.0 | ||
|
|
||
|
|
||
| def test_shape_backtest_result_aggregated_metrics_use_mean_keys() -> None: | ||
| """aggregated_metrics uses the *_mean keys plus stability_index.""" | ||
| result = _shape_backtest_result(_make_response(), "seasonal_naive") | ||
| agg = result["aggregated_metrics"] | ||
| assert set(agg) == {"mae_mean", "smape_mean", "wape_mean", "bias_mean", "stability_index"} | ||
| assert agg["mae_mean"] == 12.0 | ||
| assert agg["stability_index"] == 8.5 | ||
|
|
||
|
|
||
| def test_shape_backtest_result_includes_baseline_comparison() -> None: | ||
| """baseline_comparison carries naive/seasonal MAE and improvement percentages.""" | ||
| result = _shape_backtest_result(_make_response(with_baselines=True), "seasonal_naive") | ||
| comparison = result["baseline_comparison"] | ||
| assert comparison["naive"] == {"mae": 15.0, "improvement_pct": 20.0} | ||
| assert comparison["seasonal_naive"] == {"mae": 13.0, "improvement_pct": 7.7} | ||
|
|
||
|
|
||
| def test_shape_backtest_result_omits_baseline_when_absent() -> None: | ||
| """With no comparison summary, baseline_comparison is left out entirely.""" | ||
| result = _shape_backtest_result(_make_response(with_baselines=False), "seasonal_naive") | ||
| assert "baseline_comparison" not in result | ||
|
|
||
|
|
||
| def test_shape_backtest_result_coerces_nan_stability() -> None: | ||
| """NaN metrics are coerced to 0.0 — Postgres jsonb rejects non-finite floats.""" | ||
| result = _shape_backtest_result(_make_response(nan_stability=True), "seasonal_naive") | ||
| assert result["aggregated_metrics"]["stability_index"] == 0.0 | ||
|
|
||
|
|
||
| def test_shape_backtest_result_defaults_missing_metric_to_zero() -> None: | ||
| """A metric absent from the response defaults to 0.0 instead of being dropped. | ||
|
|
||
| Guards against silent drift if the backtesting service stops emitting one of | ||
| the _BACKTEST_METRICS keys. | ||
| """ | ||
| response = _make_response() | ||
| del response.main_model_results.aggregated_metrics["bias"] | ||
| result = _shape_backtest_result(response, "seasonal_naive") | ||
| # the *_mean key is still present in the contract, just zeroed | ||
| assert result["aggregated_metrics"]["bias_mean"] == 0.0 | ||
| assert "bias_mean" in result["aggregated_metrics"] | ||
|
|
||
|
|
||
| def test_finite_coerces_non_finite_values() -> None: | ||
| """_finite passes finite values through and maps NaN/inf to 0.0.""" | ||
| assert _finite(3.5) == 3.5 | ||
| assert _finite(0.0) == 0.0 | ||
| assert _finite(math.nan) == 0.0 | ||
| assert _finite(math.inf) == 0.0 | ||
| assert _finite(-math.inf) == 0.0 | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion (testing): Add coverage for top-level fields (backtest_id, model_type, duration_ms) in the shaped result.
Current tests only validate
fold_metrics/aggregated_metrics, but the fix also depends onbacktest_id,model_type, andduration_msbeing preserved. Please add an assertion (in this or another test) that_shape_backtest_resultpasses these fields through unchanged, to guard against future contract changes.