Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
997 changes: 997 additions & 0 deletions PRPs/PRP-MLZOO-C2-prophet-like-additive-model.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ curl -X POST http://localhost:8123/forecasting/predict \
- `regression` - Gradient-boosted exogenous-feature regressor (feature-aware)
- `lightgbm` - LightGBM feature-aware regressor — opt-in: install the `ml-lightgbm` extra and set `forecast_enable_lightgbm=True`
- `xgboost` - XGBoost feature-aware regressor — opt-in: install the `ml-xgboost` extra and set `forecast_enable_xgboost=True`
- `prophet_like` - Prophet-like additive linear model (trend / seasonality / regressor decomposition); pure scikit-learn, always available, no extra to install

See [examples/models/](examples/models/) for baseline model examples.

Expand Down
33 changes: 33 additions & 0 deletions app/features/backtesting/tests/test_feature_aware_backtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from app.features.backtesting.splitter import TimeSeriesSplitter
from app.features.forecasting.schemas import (
NaiveModelConfig,
ProphetLikeModelConfig,
RegressionModelConfig,
XGBoostModelConfig,
)
Expand Down Expand Up @@ -177,6 +178,38 @@ def test_feature_aware_backtest_runs_with_xgboost_model(
assert "mae" in fold.metrics


def test_prophet_like_feature_aware_backtest_produces_per_fold_metrics(
    sample_dates_120: list[date],
    sample_values_120: np.ndarray,
    sample_split_config_expanding: SplitConfig,
) -> None:
    """Backtest the ``prophet_like`` model end-to-end and check every fold's metrics.

    Builds a series with exogenous features from the 120-point fixtures, runs
    ``BacktestingService._run_model_backtest`` with a default
    ``ProphetLikeModelConfig`` and fold details stored, then verifies that:

    * the result is labelled ``prophet_like`` and flagged feature-aware;
    * at least one fold was produced and an aggregated ``mae`` exists;
    * every fold reports a finite ``mae``.

    This is the integration check that the Prophet-like model goes through the
    feature-aware backtest path (INITIAL-MLZOO-B's "backtesting integration
    test comparing baseline and advanced model path").
    """
    backtest = BacktestingService()._run_model_backtest(
        series_data=_series(sample_dates_120, sample_values_120, with_exogenous=True),
        splitter=TimeSeriesSplitter(sample_split_config_expanding),
        model_config=ProphetLikeModelConfig(),
        store_fold_details=True,
    )

    # Result-level contract: model identity and routing flag.
    assert backtest.model_type == "prophet_like"
    assert backtest.feature_aware is True

    # At least one fold ran and the cross-fold aggregate carries MAE.
    assert len(backtest.fold_results) > 0
    assert "mae" in backtest.aggregated_metrics

    # Per-fold contract: MAE is present and is a real number (no NaN / inf).
    for fold_result in backtest.fold_results:
        assert "mae" in fold_result.metrics
        assert np.isfinite(fold_result.metrics["mae"])


def test_feature_aware_result_records_observed_policy(
sample_dates_120: list[date],
sample_values_120: np.ndarray,
Expand Down
237 changes: 236 additions & 1 deletion app/features/forecasting/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,56 @@
from sklearn.ensemble import ( # type: ignore[import-untyped]
HistGradientBoostingRegressor,
)
from sklearn.impute import SimpleImputer # type: ignore[import-untyped]
from sklearn.linear_model import Ridge # type: ignore[import-untyped]
from sklearn.pipeline import Pipeline # type: ignore[import-untyped]

if TYPE_CHECKING:
from app.features.forecasting.schemas import ModelConfig


# Partition of the canonical feature frame into the three Prophet-style
# additive components. The tuples below list 5 + 6 + 3 = 14 distinct column
# names; because no column appears twice and none is left out, summing the
# per-component ``coef_ · x`` contributions reproduces the full linear term.
# The names are expected to match ``canonical_feature_columns()`` in
# ``app/shared/feature_frames`` — keep the two in sync.
_PROPHET_LIKE_COMPONENTS: dict[str, tuple[str, ...]] = {
    # Autoregressive lags plus ``days_since_launch``.
    "trend": (
        "lag_1",
        "lag_7",
        "lag_14",
        "lag_28",
        "days_since_launch",
    ),
    # Calendar encodings and calendar flags.
    "seasonality": (
        "dow_sin", "dow_cos",
        "month_sin", "month_cos",
        "is_weekend", "is_month_end",
    ),
    # Price, promotion and holiday regressors.
    "holiday_regressor": (
        "price_factor",
        "promo_active",
        "is_holiday",
    ),
}


@dataclass
class ForecastDecomposition:
    """Four-way additive split of a Prophet-like forecast.

    The parts are meant to satisfy, element-wise and up to float tolerance::

        intercept + trend + seasonality + holiday_regressor == predict(X)

    for the same feature matrix ``X``. The three array fields hold one value
    per forecast row (shape ``[n_rows]``); ``intercept`` is a single scalar
    that broadcasts across all rows.

    Attributes:
        intercept: Fitted Ridge intercept.
        trend: Row-wise contribution of the trend columns (the autoregressive
            lags and ``days_since_launch``).
        seasonality: Row-wise contribution of the calendar / seasonal columns.
        holiday_regressor: Row-wise contribution of the price, promotion and
            holiday-flag columns.
    """

    # Scalar term shared by every row.
    intercept: float
    # Per-row component contributions, each of shape [n_rows].
    trend: np.ndarray[Any, np.dtype[np.floating[Any]]]
    seasonality: np.ndarray[Any, np.dtype[np.floating[Any]]]
    holiday_regressor: np.ndarray[Any, np.dtype[np.floating[Any]]]


@dataclass
class FitResult:
"""Result of model fitting.
Expand Down Expand Up @@ -888,9 +933,191 @@ def set_params(self, **params: Any) -> XGBoostForecaster: # noqa: ANN401
return self


class ProphetLikeForecaster(BaseForecaster):
    """Feature-aware ADDITIVE forecaster — Ridge over the canonical frame.

    Prophet-LIKE, not Prophet: it approximates Prophet's additive trend +
    seasonality + holiday/regressor decomposition with a regularized linear
    model over the already-engineered 14-column feature frame. It REQUIRES a
    non-``None`` exogenous ``X`` for both ``fit`` and ``predict``.

    The fitted estimator is a scikit-learn ``Pipeline`` of two deterministic
    steps:

    1. ``SimpleImputer(strategy="median", keep_empty_features=True)`` fills
       ``NaN`` cells (a bare ``Ridge`` raises on ``NaN`` input). Medians are
       learned on the training ``X`` only and re-applied at predict time, so
       no predict-time statistics leak in. ``keep_empty_features=True`` keeps
       a column that is entirely ``NaN`` during training (it is filled with
       0) instead of dropping it; dropping would shift the positions of the
       Ridge coefficients and break the position-based ``decompose()``.
    2. ``Ridge(solver="cholesky")`` — a closed-form L2-regularized fit.

    ``decompose()`` returns the per-component additive contributions of a
    forecast — the literal ``y_hat = intercept + trend + seasonality +
    holiday_regressor`` split, computed on the IMPUTED ``X``.

    NOT modelled (deliberately — see PRP-MLZOO-C2 Risks): changepoint trend,
    posterior uncertainty intervals, automatic seasonality discovery,
    multiplicative seasonality. This is an additive linear approximation, not
    the real ``prophet`` package.

    Attributes:
        alpha: Ridge L2 regularization strength (0.0 degenerates to OLS).
    """

    requires_features: ClassVar[bool] = True
    """A feature-aware model — ``fit``/``predict`` REQUIRE a non-None ``X``."""

    def __init__(self, *, alpha: float = 1.0, random_state: int = 42) -> None:
        """Initialize the Prophet-like additive forecaster.

        Args:
            alpha: Ridge L2 regularization strength. The default 1.0 keeps
                coefficients robust to the collinear engineered-feature frame.
            random_state: Kept for interface parity with the other forecasters;
                ``Ridge(solver="cholesky")`` is closed-form and needs no seed.
        """
        super().__init__(random_state)
        self.alpha = alpha
        # Populated by ``fit``; ``None`` means "not fitted yet".
        self._estimator: Any = None

    def fit(
        self,
        y: np.ndarray[Any, np.dtype[np.floating[Any]]],
        X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None,
    ) -> ProphetLikeForecaster:
        """Fit the additive Ridge pipeline on historical features.

        Args:
            y: Target values (1D array of shape ``[n_samples]``).
            X: Exogenous features (2D array of shape ``[n_samples, n_features]``).
                REQUIRED — unlike the baseline forecasters.

        Returns:
            self (for method chaining).

        Raises:
            ValueError: If ``X`` is ``None``, ``y`` is empty, or the row counts
                of ``X`` and ``y`` do not match.
        """
        if X is None:
            raise ValueError("ProphetLikeForecaster requires exogenous features X for fit()")
        if len(y) == 0:
            raise ValueError("Cannot fit on empty array")
        if X.shape[0] != len(y):
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {len(y)} — feature/target rows must match"
            )
        # The imputer learns its per-column medians on THIS training X only.
        # keep_empty_features=True stops it from silently dropping a column
        # that is all-NaN in the training window (e.g. a long lag on a short
        # fold): such a column is filled with 0 and kept, so ``coef_`` stays
        # aligned with the input column positions that decompose() relies on.
        estimator: Any = Pipeline(
            [
                ("impute", SimpleImputer(strategy="median", keep_empty_features=True)),
                ("ridge", Ridge(alpha=self.alpha, solver="cholesky")),
            ]
        )
        estimator.fit(X, y)
        self._estimator = estimator
        self._last_values = np.asarray(y[-1:], dtype=np.float64)
        self._is_fitted = True
        return self

    def predict(
        self,
        horizon: int,
        X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None,
    ) -> np.ndarray[Any, np.dtype[np.floating[Any]]]:
        """Generate forecasts from a future feature frame.

        Args:
            horizon: Number of steps to forecast.
            X: Exogenous features for the forecast period, shape
                ``[horizon, n_features]``. REQUIRED.

        Returns:
            Array of forecasts with shape ``[horizon]``.

        Raises:
            RuntimeError: If the model has not been fitted.
            ValueError: If ``X`` is ``None`` or its row count is not ``horizon``.
        """
        if not self._is_fitted or self._estimator is None:
            raise RuntimeError("Model must be fitted before predict")
        if X is None:
            raise ValueError("ProphetLikeForecaster requires exogenous features X for predict()")
        if X.shape[0] != horizon:
            raise ValueError(f"X has {X.shape[0]} rows but horizon is {horizon} — they must match")
        # The Pipeline imputes the NaN cells, then the Ridge predicts.
        predictions = self._estimator.predict(X)
        result: np.ndarray[Any, np.dtype[np.floating[Any]]] = np.asarray(
            predictions, dtype=np.float64
        )
        return result

    def decompose(self, X: np.ndarray[Any, np.dtype[np.floating[Any]]]) -> ForecastDecomposition:
        """Split a forecast into its additive trend / seasonality / regressor parts.

        Operates on the IMPUTED ``X`` — the trained imputer's ``transform`` —
        so the per-component contributions sum to ``predict(...)``: a ``NaN``
        cell is filled with the value the imputer learned on the TRAINING
        window, never a predict-time statistic (no leakage). Each component
        contribution is the partial sum ``Σ_{i ∈ component} coef_i · x_i``;
        together the three component column-sets partition the canonical
        columns, so ``intercept + trend + seasonality + holiday_regressor ==
        predict()``.

        Args:
            X: Feature matrix of shape ``[n_rows, n_features]`` (the same frame
                a ``predict`` call would consume). May contain ``NaN`` cells.

        Returns:
            A :class:`ForecastDecomposition` with the four-way breakdown.

        Raises:
            RuntimeError: If the model has not been fitted.
            ValueError: If the fitted coefficient vector does not have exactly
                one entry per canonical feature column (the model was fitted
                on a frame that is not the canonical one), or if a component
                column name is missing from the canonical column list.
        """
        from app.shared.feature_frames import canonical_feature_columns

        if not self._is_fitted or self._estimator is None:
            raise RuntimeError("Model must be fitted before decompose")
        imputer = self._estimator.named_steps["impute"]
        ridge = self._estimator.named_steps["ridge"]
        columns = canonical_feature_columns()
        coef = np.asarray(ridge.coef_, dtype=np.float64)
        # Components are located by canonical column POSITION, so a
        # coefficient vector of any other width would be mis-attributed (or
        # index out of range). Fail loudly instead.
        if coef.shape != (len(columns),):
            raise ValueError(
                f"Fitted coefficients have shape {coef.shape} but the canonical feature "
                f"frame has {len(columns)} columns — decompose() requires a model fitted "
                "on the canonical feature frame"
            )
        x_imputed = imputer.transform(X)
        contributions: dict[str, np.ndarray[Any, np.dtype[np.floating[Any]]]] = {}
        for component, comp_cols in _PROPHET_LIKE_COMPONENTS.items():
            idx = [columns.index(c) for c in comp_cols]
            contributions[component] = np.asarray(x_imputed[:, idx] @ coef[idx], dtype=np.float64)
        return ForecastDecomposition(
            intercept=float(ridge.intercept_),
            trend=contributions["trend"],
            seasonality=contributions["seasonality"],
            holiday_regressor=contributions["holiday_regressor"],
        )

    def get_params(self) -> dict[str, Any]:
        """Get model parameters.

        Returns:
            Dictionary with alpha and random_state.
        """
        return {"alpha": self.alpha, "random_state": self.random_state}

    def set_params(self, **params: Any) -> ProphetLikeForecaster:  # noqa: ANN401
        """Set model parameters.

        Every keyword is assigned as an attribute via ``setattr`` without
        validation; an already-fitted estimator is left untouched, so a
        changed ``alpha`` only takes effect on the next ``fit``.

        Args:
            **params: Parameter names and values to set.

        Returns:
            self (for method chaining).
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self


# Type alias for model type literals. One entry per line with a trailing
# comma: two adjacent string literals without a comma are silently
# concatenated by Python (``"regression" "naive"`` -> ``"regressionnaive"``),
# which would drop a valid model type from the alias.
ModelType = Literal[
    "naive",
    "seasonal_naive",
    "moving_average",
    "xgboost",
    "lightgbm",
    "regression",
    "prophet_like",
]


Expand Down Expand Up @@ -974,5 +1201,13 @@ def model_factory(config: ModelConfig, random_state: int = 42) -> BaseForecaster
random_state=random_state,
)
raise ValueError("Invalid config type for regression")
elif model_type == "prophet_like":
# No flag gate — the Prophet-like model is pure scikit-learn and ships
# always-enabled, exactly like ``regression``.
from app.features.forecasting.schemas import ProphetLikeModelConfig

if isinstance(config, ProphetLikeModelConfig):
return ProphetLikeForecaster(alpha=config.alpha, random_state=random_state)
raise ValueError("Invalid config type for prophet_like")
else:
raise ValueError(f"Unknown model type: {model_type}")
32 changes: 32 additions & 0 deletions app/features/forecasting/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,37 @@ class RegressionModelConfig(ModelConfigBase):
)


class ProphetLikeModelConfig(ModelConfigBase):
    """Settings for the Prophet-like additive forecaster (MLZOO-C2).

    The model behind this config is a deterministic, L2-regularized ADDITIVE
    linear model: a ``Ridge`` regressor over the canonical 14-column feature
    frame whose prediction splits into trend, seasonality and
    holiday-regressor components. It mimics the additive shape of Prophet
    without depending on the real ``prophet``/Stan stack, and therefore does
    NOT provide changepoint trend, posterior uncertainty or automatic
    seasonality discovery. It is pure scikit-learn: no optional extra and no
    feature flag (like ``RegressionModelConfig``).

    Attributes:
        model_type: Discriminator, fixed to ``"prophet_like"``.
        alpha: Ridge L2 regularization strength, constrained to
            ``0.0 <= alpha <= 10000.0`` with a default of 1.0. A value of 0.0
            degenerates to ordinary least squares; the default keeps
            coefficients robust to the collinear engineered-feature frame.
        feature_config_hash: Optional hash of the feature contract used;
            defaults to ``None``.
    """

    model_type: Literal["prophet_like"] = "prophet_like"
    alpha: float = Field(
        description="Ridge L2 regularization strength",
        default=1.0,
        ge=0.0,
        le=10000.0,
    )
    feature_config_hash: str | None = Field(
        description="Hash of the feature contract used for training",
        default=None,
    )


# Union type for all model configs
ModelConfig = (
NaiveModelConfig
Expand All @@ -240,6 +271,7 @@ class RegressionModelConfig(ModelConfigBase):
| LightGBMModelConfig
| XGBoostModelConfig
| RegressionModelConfig
| ProphetLikeModelConfig
)


Expand Down
Loading