Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion PRPs/INITIAL/INITIAL-MLZOO-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Recommended PRP sequence:
| Order | INITIAL | Intended PRP | Purpose |
| --- | --- | --- | --- |
| 1 | `INITIAL-MLZOO-A-foundation-feature-frames.md` | PRP-29 | Feature-aware forecasting foundation and leakage-safe frame contracts |
| 2 | `INITIAL-MLZOO-B-lightgbm-first-model.md` | Future PRP | First advanced model path with LightGBM or sklearn fallback |
| 2 | `INITIAL-MLZOO-B-lightgbm-first-model.md` | PRP-30 | First advanced model path with LightGBM (optional `ml-lightgbm` extra) |
| 3 | `INITIAL-MLZOO-C-xgboost-prophet-extensions.md` | Future PRP | XGBoost and Prophet-like extensions |
| 4 | `INITIAL-MLZOO-D-frontend-registry-explainability.md` | Future PRP | UI, registry surfacing, and explanation polish |

Expand Down
944 changes: 944 additions & 0 deletions PRPs/PRP-30-lightgbm-first-advanced-model.md

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ docker-compose up -d
```bash
uv sync --extra dev
# or: pip install -e ".[dev]"
# LightGBM is an opt-in advanced model — add the extra to enable it:
# uv sync --extra dev --extra ml-lightgbm (then set forecast_enable_lightgbm=true)
```

4. **Run database migrations**
Expand Down Expand Up @@ -338,7 +340,8 @@ curl -X POST http://localhost:8123/forecasting/predict \
- `naive` - Last observed value (simple baseline)
- `seasonal_naive` - Same period from previous season
- `moving_average` - Mean of last N observations
- `lightgbm` - LightGBM regressor (requires `forecast_enable_lightgbm=True`)
- `regression` - Gradient-boosted exogenous-feature regressor (feature-aware)
- `lightgbm` - LightGBM feature-aware regressor — opt-in: install the `ml-lightgbm` extra and set `forecast_enable_lightgbm=True`

See [examples/models/](examples/models/) for baseline model examples.

Expand Down
167 changes: 165 additions & 2 deletions app/features/forecasting/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,161 @@ def set_params(self, **params: Any) -> RegressionForecaster: # noqa: ANN401
return self


class LightGBMForecaster(BaseForecaster):
    """Feature-aware forecaster wrapping ``lightgbm.LGBMRegressor``.

    The first ADVANCED feature-aware model (MLZOO-B). Like
    ``RegressionForecaster`` it REQUIRES a non-``None`` exogenous ``X`` for both
    ``fit`` and ``predict``; unlike it, the estimator is gradient-boosted
    leaf-wise trees from the optional ``lightgbm`` package.

    ``lightgbm`` is imported LAZILY inside ``fit`` — never at module scope and
    never in ``__init__`` — so importing this module (which every forecasting
    code path does, baseline models included) never requires the optional
    ``ml-lightgbm`` dependency. When the package is missing, ``fit`` raises a
    ``ModuleNotFoundError`` whose message names the extra to install.

    Determinism: ``LGBMRegressor`` is bit-reproducible only with ``n_jobs=1``
    AND ``deterministic=True`` AND ``force_col_wise=True`` AND a fixed
    ``random_state`` — all four are pinned in ``fit``. LightGBM also tolerates
    ``NaN`` natively, which matters because the future feature frame leaves lag
    cells ``NaN`` when their source target lies in the un-observed horizon.

    Attributes:
        n_estimators: Number of boosting rounds.
        learning_rate: Gradient-boosting learning rate.
        max_depth: Maximum depth of each tree.
    """

    # A feature-aware model — ``fit``/``predict`` REQUIRE a non-None ``X``.
    requires_features: ClassVar[bool] = True

    def __init__(
        self,
        *,
        n_estimators: int = 100,
        learning_rate: float = 0.1,
        max_depth: int = 6,
        random_state: int = 42,
    ) -> None:
        """Initialize the LightGBM forecaster.

        No LightGBM object is created here; the estimator is built in ``fit``.

        Args:
            n_estimators: Number of boosting rounds.
            learning_rate: Gradient-boosting learning rate.
            max_depth: Maximum depth of each tree.
            random_state: Random seed for reproducibility (determinism).
        """
        super().__init__(random_state)
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        # Populated by ``fit``; ``None`` means "not fitted yet".
        self._estimator: Any = None

    def fit(
        self,
        y: np.ndarray[Any, np.dtype[np.floating[Any]]],
        X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None,
    ) -> LightGBMForecaster:
        """Fit the gradient-boosted regressor on historical features.

        Args:
            y: Target values (1D array of shape ``[n_samples]``).
            X: Exogenous features (2D array of shape ``[n_samples, n_features]``).
                REQUIRED — unlike the baseline forecasters.

        Returns:
            self (for method chaining).

        Raises:
            ValueError: If ``X`` is ``None``, ``y`` is empty, or the row counts
                of ``X`` and ``y`` do not match.
            ModuleNotFoundError: If the optional ``lightgbm`` package is not
                installed (install the ``ml-lightgbm`` extra).
        """
        if X is None:
            raise ValueError("LightGBMForecaster requires exogenous features X for fit()")
        if len(y) == 0:
            raise ValueError("Cannot fit on empty array")
        if X.shape[0] != len(y):
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {len(y)} — feature/target rows must match"
            )
        # LAZY import — the optional ``ml-lightgbm`` dependency is only needed
        # when a LightGBM model is actually fitted.
        try:
            import lightgbm as lgb
        except ModuleNotFoundError as exc:
            # Only rewrite the error when ``lightgbm`` itself is the missing
            # module; a missing transitive dependency keeps its own message.
            if exc.name != "lightgbm":
                raise
            # Same exception type (still an ImportError subclass), but with an
            # actionable message instead of "No module named 'lightgbm'".
            raise ModuleNotFoundError(
                "LightGBMForecaster requires the optional 'lightgbm' package — "
                "install the 'ml-lightgbm' extra (e.g. `uv sync --extra ml-lightgbm`)",
                name=exc.name,
            ) from exc

        # Determinism: random_state, n_jobs, deterministic and force_col_wise
        # are ALL required together for a bit-reproducible fit.
        estimator: Any = lgb.LGBMRegressor(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            random_state=self.random_state,
            n_jobs=1,
            deterministic=True,
            force_col_wise=True,
            verbosity=-1,  # silence LightGBM's training chatter
        )
        estimator.fit(X, y)
        # Publish fitted state only after training succeeded, so a failed fit
        # never leaves the forecaster looking fitted.
        self._estimator = estimator
        self._last_values = np.asarray(y[-1:], dtype=np.float64)
        self._is_fitted = True
        return self

    def predict(
        self,
        horizon: int,
        X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None,
    ) -> np.ndarray[Any, np.dtype[np.floating[Any]]]:
        """Generate forecasts from a future feature frame.

        Args:
            horizon: Number of steps to forecast.
            X: Exogenous features for the forecast period, shape
                ``[horizon, n_features]``. REQUIRED.

        Returns:
            Array of forecasts with shape ``[horizon]``.

        Raises:
            RuntimeError: If the model has not been fitted.
            ValueError: If ``X`` is ``None`` or its row count is not ``horizon``.
        """
        if not self._is_fitted or self._estimator is None:
            raise RuntimeError("Model must be fitted before predict")
        if X is None:
            raise ValueError("LightGBMForecaster requires exogenous features X for predict()")
        if X.shape[0] != horizon:
            raise ValueError(f"X has {X.shape[0]} rows but horizon is {horizon} — they must match")
        predictions = self._estimator.predict(X)
        # Normalize whatever the estimator returns to a float64 ndarray.
        result: np.ndarray[Any, np.dtype[np.floating[Any]]] = np.asarray(
            predictions, dtype=np.float64
        )
        return result

    def get_params(self) -> dict[str, Any]:
        """Get model parameters.

        Returns:
            Dictionary with n_estimators, learning_rate, max_depth, random_state.
        """
        return {
            "n_estimators": self.n_estimators,
            "learning_rate": self.learning_rate,
            "max_depth": self.max_depth,
            "random_state": self.random_state,
        }

    def set_params(self, **params: Any) -> LightGBMForecaster:  # noqa: ANN401
        """Set model parameters.

        Each keyword is assigned as an attribute of the same name. An
        already-fitted estimator is left untouched; new values take effect on
        the next ``fit``.

        Args:
            **params: Parameter names and values to set.

        Returns:
            self (for method chaining).
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self


# Literal type enumerating the model-type identifier strings.
ModelType = Literal[
    "naive",
    "seasonal_naive",
    "moving_average",
    "lightgbm",
    "regression",
]

Expand Down Expand Up @@ -625,8 +780,16 @@ def model_factory(config: ModelConfig, random_state: int = 42) -> BaseForecaster
raise ValueError(
"LightGBM is not enabled. Set forecast_enable_lightgbm=True in settings."
)
# LightGBM implementation would go here when feature-flagged
raise NotImplementedError("LightGBM forecaster not yet implemented")
from app.features.forecasting.schemas import LightGBMModelConfig

if isinstance(config, LightGBMModelConfig):
return LightGBMForecaster(
n_estimators=config.n_estimators,
learning_rate=config.learning_rate,
max_depth=config.max_depth,
random_state=random_state,
)
raise ValueError("Invalid config type for lightgbm")
elif model_type == "regression":
from app.features.forecasting.schemas import RegressionModelConfig

Expand Down
27 changes: 27 additions & 0 deletions app/features/forecasting/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class ModelBundle:
created_at: Timestamp when bundle was created.
python_version: Python version used when saving.
sklearn_version: Scikit-learn version used when saving.
lightgbm_version: LightGBM version used when saving, ``None`` when the
optional ``ml-lightgbm`` dependency was not installed.
bundle_hash: Deterministic hash of bundle contents.
"""

Expand All @@ -51,6 +53,7 @@ class ModelBundle:
created_at: datetime | None = None
python_version: str | None = None
sklearn_version: str | None = None
lightgbm_version: str | None = None
bundle_hash: str | None = None

def compute_hash(self) -> str:
Expand Down Expand Up @@ -95,6 +98,14 @@ def save_model_bundle(bundle: ModelBundle, path: str | Path) -> Path:
bundle.created_at = datetime.now(UTC)
bundle.python_version = sys.version
bundle.sklearn_version = sklearn.__version__
# Best-effort: LightGBM is an optional dependency, so a baseline-only
# install legitimately has no version to record.
try:
import lightgbm

bundle.lightgbm_version = str(lightgbm.__version__)
except ImportError:
bundle.lightgbm_version = None
bundle.bundle_hash = bundle.compute_hash()

# Save with compression
Expand Down Expand Up @@ -171,6 +182,22 @@ def load_model_bundle(path: str | Path, base_dir: str | Path | None = None) -> M
current_sklearn=sklearn.__version__,
)

# LightGBM is optional — only warn when the bundle recorded a version AND
# the optional dependency is importable here AND the two differ.
if bundle.lightgbm_version:
try:
import lightgbm

current_lightgbm: str | None = str(lightgbm.__version__)
except ImportError:
current_lightgbm = None
if current_lightgbm is not None and bundle.lightgbm_version != current_lightgbm:
logger.warning(
"forecasting.lightgbm_version_mismatch",
saved_lightgbm=bundle.lightgbm_version,
current_lightgbm=current_lightgbm,
)

logger.info(
"forecasting.model_bundle_loaded",
path=str(path),
Expand Down
Loading
Loading