From 8541553aef8eb5288c8fe86705ad7f22459c3430 Mon Sep 17 00:00:00 2001 From: Gabor Szabo <168316277+w7-mgfcode@users.noreply.github.com> Date: Sun, 1 Feb 2026 00:32:24 +0100 Subject: [PATCH 1/5] feat(featuresets): implement time-safe feature engineering layer (#24) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: update INITIAL-4 and INITIAL-5 with additional references - Add scikit-learn, mlforecast, and sktime documentation links - Add considerations for imputation logic, agent tooling, and computation overhead - Add model persistence documentation references Co-Authored-By: Claude Opus 4.5 * feat(featuresets): implement time-safe feature engineering layer Add complete feature engineering module with: - Pydantic schemas for feature configuration (lag, rolling, calendar, exogenous, imputation) - FeatureEngineeringService with CRITICAL leakage prevention: - Lag features use positive shift() only - Rolling features use shift(1) BEFORE rolling to exclude current observation - Group-aware operations prevent cross-series leakage - Cutoff date filtering before any computation - FastAPI endpoints: POST /featuresets/compute and /featuresets/preview - Comprehensive test suite (55 tests) including leakage prevention tests - Example demo script Co-Authored-By: Claude Opus 4.5 * docs: update INITIAL-5.md * docs: update documentation for Phase 3 Feature Engineering - README.md: Add featuresets module to project structure and API endpoints - docs/ARCHITECTURE.md: Add Feature Engineering section (section 6) - docs/PHASE-index.md: Mark Phase 3 as completed with summary - docs/PHASE/3-FEATURE_ENGINEERING.md: Create detailed phase documentation Co-Authored-By: Claude Opus 4.5 * fix(featuresets): address code review feedback and prevent data leakage Routes: - Validate store_id/product_id presence (no silent defaults to 0) - Convert ValueError for unsupported date types to HTTP 400 Service: - Add expanding_mean imputation strategy 
(time-safe alternative) - Add warnings when bfill/mean strategies are used (leakage risk) - Fix price_pct_change_7d to use shift(1) before pct_change Schemas: - Add expanding_mean to ImputationConfig Literal type - Document time-safety of each imputation strategy - Fix PreviewFeaturesRequest docstring: GET → POST Documentation: - Convert bare URLs to markdown links in INITIAL-4.md, INITIAL-5.md - Fix PRP-4 to show POST for preview endpoint Co-Authored-By: Claude Opus 4.5 * style: format schemas.py Co-Authored-By: Claude Opus 4.5 --------- Co-authored-by: Gabe@w7dev Co-authored-by: Claude Opus 4.5 --- INITIAL-4.md | 6 + INITIAL-5.md | 12 +- PRPs/PRP-4-feature-engineering.md | 1225 +++++++++++++++++ README.md | 38 +- app/core/config.py | 5 + app/features/featuresets/__init__.py | 33 + app/features/featuresets/routes.py | 342 +++++ app/features/featuresets/schemas.py | 355 +++++ app/features/featuresets/service.py | 588 ++++++++ app/features/featuresets/tests/__init__.py | 1 + app/features/featuresets/tests/conftest.py | 171 +++ .../featuresets/tests/test_leakage.py | 328 +++++ .../featuresets/tests/test_schemas.py | 261 ++++ .../featuresets/tests/test_service.py | 319 +++++ app/main.py | 2 + docs/ARCHITECTURE.md | 62 +- docs/PHASE-index.md | 37 +- docs/PHASE/3-FEATURE_ENGINEERING.md | 283 ++++ examples/compute_features_demo.py | 229 +++ pyproject.toml | 7 + uv.lock | 187 ++- 21 files changed, 4468 insertions(+), 23 deletions(-) create mode 100644 PRPs/PRP-4-feature-engineering.md create mode 100644 app/features/featuresets/__init__.py create mode 100644 app/features/featuresets/routes.py create mode 100644 app/features/featuresets/schemas.py create mode 100644 app/features/featuresets/service.py create mode 100644 app/features/featuresets/tests/__init__.py create mode 100644 app/features/featuresets/tests/conftest.py create mode 100644 app/features/featuresets/tests/test_leakage.py create mode 100644 app/features/featuresets/tests/test_schemas.py create mode 100644 
app/features/featuresets/tests/test_service.py create mode 100644 docs/PHASE/3-FEATURE_ENGINEERING.md create mode 100644 examples/compute_features_demo.py diff --git a/INITIAL-4.md b/INITIAL-4.md index 285fc7fd..097c63e7 100644 --- a/INITIAL-4.md +++ b/INITIAL-4.md @@ -18,7 +18,13 @@ ## DOCUMENTATION: - Time-series feature engineering best practices - scikit-learn transformers/pipelines (if used) +- [scikit-learn Pipeline Composition](https://scikit-learn.org/stable/modules/compose.html) +- [MLForecast Feature Engineering](https://www.nixtla.io/blog/automated-time-series-feature-engineering-with-mlforecast?utm_source=chatgpt.com#introduction-to-mlforecast) +- [sktime Transformations API](https://www.sktime.net/en/stable/api_reference/transformations.html) ## OTHER CONSIDERATIONS: - Feature configs must be persisted per run in the registry. - Reproducibility: same config + same data window must be re-runnable. +- **Imputation Logic**: Define behavior for missing price data (forward-fill) vs missing sales data (zero-fill). +- **Agent Tooling**: Expose the Feature Pipeline as a tool for PydanticAI to "inspect" the shape of the data before suggesting ModelConfigs. +- **Computation Overhead**: Evaluate if features should be computed on-the-fly in FastAPI or pre-computed in a materialized view for performance. diff --git a/INITIAL-5.md b/INITIAL-5.md index 6f3dd911..55e9f06a 100644 --- a/INITIAL-5.md +++ b/INITIAL-5.md @@ -11,6 +11,13 @@ - Extensible “Global ML” hook: - regression pipeline (scikit-learn) - enabled/disabled via feature flags +- Unified Estimator Pipeline: + - Scikit-learn Pipeline incorporating Scaling -> Encoding -> Regressor. + - Integration with FeatureEngineeringService for automated lag-injection. +- Persistence Layer: + - Joblib-based serialization including a 'ModelBundle' (Model + Metadata + FeatureHash). +- Multi-Horizon Support: + - Logic for Recursive Forecasting (predicting day-by-day and updating lags). 
## EXAMPLES: - `examples/models/baseline_naive.py` @@ -21,8 +28,11 @@ ## DOCUMENTATION: - scikit-learn estimators + pipelines - joblib serialization patterns +- [scikit-learn Pipeline Composition](https://scikit-learn.org/stable/modules/compose.html) +- [scikit-learn Glossary](https://scikit-learn.org/stable/glossary.html) +- [scikit-learn Model Persistence](https://scikit-learn.org/stable/model_persistence.html) ## OTHER CONSIDERATIONS: - No hardcoded horizons: driven by request/config. - Determinism: random seed from Settings. -- Enforce input grain validation (store×product×date). +- Enforce input grain validation (store×product×date). \ No newline at end of file diff --git a/PRPs/PRP-4-feature-engineering.md b/PRPs/PRP-4-feature-engineering.md new file mode 100644 index 00000000..10a831e2 --- /dev/null +++ b/PRPs/PRP-4-feature-engineering.md @@ -0,0 +1,1225 @@ +# PRP-4: Feature Engineering — Time-Safe Feature Computation + +## Goal + +Implement a time-safe feature engineering module for the ForecastLabAI forecasting pipeline. The module provides configurable feature computation (lags, rolling windows, calendar features, exogenous signals) with **guaranteed no future data leakage** through explicit cutoff enforcement. 
+ +**End State:** A production-ready `featuresets` vertical slice with: +- `FeatureSetConfig` — Pydantic-based configuration schema with versioning +- `FeatureEngineeringService` — Core service computing features with cutoff enforcement +- `POST /featuresets/compute` — API endpoint to compute features for a series +- `POST /featuresets/preview` — Preview feature output for debugging/exploration +- Time-safety enforced at every level (lags, rolling, calendar) +- Comprehensive tests including **leakage-focused validation tests** +- All validation gates passing (ruff, mypy, pyright, pytest) + +--- + +## Why + +- **Foundation for ForecastOps**: INITIAL-5 (Forecasting) and INITIAL-6 (Backtesting) require computed features as input +- **Prevent Data Leakage**: Time-series models fail silently when features leak future data; explicit cutoff enforcement prevents this +- **Reproducibility**: Feature configs are versioned and hashable for registry storage (INITIAL-7) +- **Configurability**: Different products/stores may need different feature sets without code changes +- **Agent Tooling Ready**: Exposes feature inspection for PydanticAI agents (INITIAL-9) + +--- + +## What + +### Success Criteria + +- [ ] `FeatureSetConfig` schema with `lag_config`, `rolling_config`, `calendar_config`, `exogenous_config`, `imputation_config` +- [ ] All configs have `schema_version` field and deterministic `config_hash()` method +- [ ] `FeatureEngineeringService.compute_features(df, cutoff_date)` returns features respecting cutoff +- [ ] Lag features use `shift(lag)` — only past data +- [ ] Rolling features use `shift(1)` before rolling — excludes current observation +- [ ] Calendar features derived from date column (no leakage possible) +- [ ] Exogenous features (price/promo/inventory) lagged appropriately +- [ ] Imputation strategies: zero-fill for sales, forward-fill for prices +- [ ] `POST /featuresets/compute` endpoint accepts config + data window + cutoff +- [ ] `POST 
/featuresets/preview` endpoint for single-series feature inspection +- [ ] Unit tests for each feature type +- [ ] **Leakage tests** verify no future data used +- [ ] Integration tests with real DB queries +- [ ] Example files: `examples/features/preview_features.py`, `examples/features/config_shape.json` + +--- + +## All Needed Context + +### Documentation & References + +```yaml +# MUST READ - Critical for implementation +- url: https://www.nixtla.io/blog/automated-time-series-feature-engineering-with-mlforecast + why: MLForecast patterns for time-safe lag/rolling transforms + critical: shift(1) before rolling to prevent leakage + +- url: https://towardsdatascience.com/avoiding-data-leakage-in-timeseries-101-25ea13fcb15f + why: Common leakage pitfalls and detection strategies + critical: Random CV is wrong for time series; always use time-based splits + +- url: https://www.sktime.net/en/stable/api_reference/transformations.html + why: WindowSummarizer patterns if needed + critical: lag_feature dict syntax for window configurations + +- url: https://docs.pydantic.dev/latest/concepts/validators/ + why: Pydantic v2 field validators and model validators + critical: Use ConfigDict(frozen=True) for immutable configs + +- url: https://learn.microsoft.com/en-us/azure/machine-learning/concept-automl-forecasting-calendar-features + why: Best practices for calendar features in demand forecasting + critical: Cyclical encoding (sin/cos) for periodic features + +- url: https://scikit-learn.org/stable/modules/compose.html + why: Pipeline composition patterns + critical: Feature union and column transformer patterns + +# Internal codebase files - MUST reference these patterns +- file: app/features/ingest/service.py + why: Service pattern with Protocol, dataclass results, async DB queries + pattern: KeyResolverProtocol, UpsertResult dataclass, batch queries + +- file: app/features/ingest/schemas.py + why: Pydantic v2 schema patterns with Field(), validators + pattern: ConfigDict, 
field_validator, model_validator + +- file: app/features/ingest/routes.py + why: FastAPI router structure with Depends(get_db) + pattern: Router tags, response_model, status codes + +- file: app/features/ingest/tests/conftest.py + why: Test fixture patterns with MockKeyResolver + pattern: Protocol-based mocks, sample data fixtures + +- file: app/features/data_platform/models.py + why: SalesDaily, Calendar, PriceHistory, Promotion, InventorySnapshotDaily models + pattern: Relationships, grain constraints, Decimal types + +- file: app/core/config.py + why: Pydantic Settings pattern + pattern: Field() with defaults, ge/le validators + +- file: docs/validation/logging-standard.md + why: Event naming convention + pattern: featureops.{action}_{state} + +- file: CLAUDE.md + why: Type safety requirements, vertical slice architecture, KISS/YAGNI + critical: No future leakage, time-based splits only +``` + +### Current Codebase Tree + +```bash +app/ +├── __init__.py +├── main.py # FastAPI entry, router registration +├── core/ +│ ├── config.py # Pydantic Settings +│ ├── database.py # AsyncSession, get_db() +│ ├── exceptions.py # ForecastLabError hierarchy +│ ├── health.py # Router pattern +│ ├── logging.py # Structured logging +│ └── middleware.py # RequestIdMiddleware +├── shared/ +│ ├── models.py # TimestampMixin +│ ├── schemas.py # ErrorResponse +│ └── utils.py # Utilities +└── features/ + ├── data_platform/ + │ ├── models.py # Store, Product, Calendar, SalesDaily, PriceHistory, etc. 
+ │ ├── schemas.py # Pydantic schemas + │ └── tests/ + └── ingest/ + ├── routes.py # POST /ingest/sales-daily + ├── schemas.py # Ingest schemas + ├── service.py # KeyResolver, upsert logic + └── tests/ +``` + +### Desired Codebase Tree (files to be added) + +```bash +app/ +├── core/ +│ └── config.py # MODIFY: Add feature engineering settings +└── features/ + └── featuresets/ # NEW: Feature engineering vertical slice + ├── __init__.py # Module exports + ├── schemas.py # FeatureSetConfig, LagConfig, RollingConfig, etc. + ├── service.py # FeatureEngineeringService + ├── routes.py # POST /featuresets/compute, POST /featuresets/preview + └── tests/ + ├── __init__.py + ├── conftest.py # Feature-specific fixtures, sample time series + ├── test_schemas.py # Config validation tests + ├── test_service.py # Service logic tests + ├── test_leakage.py # CRITICAL: Leakage prevention tests + └── test_routes.py # Integration tests + +examples/ +└── features/ + ├── preview_features.py # NEW: Feature preview script + ├── config_shape.json # NEW: Example config JSON + └── leakage_tests.py # NEW: Leakage test templates +``` + +### Known Gotchas & Library Quirks + +```python +# CRITICAL: Lag features must use positive shift values +# ❌ WRONG: df["lag_1"] = df["quantity"].shift(-1) # LEAKS FUTURE! +# ✅ CORRECT: df["lag_1"] = df["quantity"].shift(1) # Uses past data + +# CRITICAL: Rolling features must exclude current observation +# ❌ WRONG: df["rolling_mean_7"] = df["quantity"].rolling(7).mean() # Includes current! +# ✅ CORRECT: df["rolling_mean_7"] = df["quantity"].shift(1).rolling(7).mean() # Excludes current + +# CRITICAL: Group-aware operations for multi-series data +# ❌ WRONG: df["lag_1"] = df["quantity"].shift(1) # Leaks across series! 
+# ✅ CORRECT: df["lag_1"] = df.groupby(["store_id", "product_id"])["quantity"].shift(1) + +# CRITICAL: Cutoff must be explicit parameter, not inferred +# ❌ WRONG: def compute_features(df): # No cutoff = risk of leakage +# ✅ CORRECT: def compute_features(df, cutoff_date: date): # Explicit cutoff + +# CRITICAL: Decimal for monetary values +# ❌ WRONG: price: float +# ✅ CORRECT: price: Decimal = Field(..., decimal_places=2) + +# CRITICAL: Pydantic v2 frozen configs for immutability +# ❌ WRONG: class Config: pass +# ✅ CORRECT: model_config = ConfigDict(frozen=True, extra="forbid") + +# CRITICAL: Cyclical encoding for periodic features +# ❌ WRONG: day_of_week as integer 0-6 (ML sees 0 and 6 as far apart) +# ✅ CORRECT: sin/cos encoding preserves cyclical continuity + +# CRITICAL: Forward-fill for prices, zero-fill for sales +# - Missing sales = no sales = 0 +# - Missing price = price unchanged = forward-fill +# - Missing inventory = last known state = forward-fill + +# CRITICAL: Calendar dates must exist in Calendar table +# Features referencing calendar (is_holiday, etc.) 
require FK lookup +``` + +--- + +## Implementation Blueprint + +### Data Models and Structure + +#### Feature Configuration Schemas (app/features/featuresets/schemas.py) + +```python +"""Pydantic schemas for feature engineering configuration.""" + +from datetime import date, datetime +from decimal import Decimal +from typing import Literal +import hashlib + +from pydantic import BaseModel, Field, ConfigDict, field_validator + + +class FeatureConfigBase(BaseModel): + """Base configuration with versioning support.""" + + model_config = ConfigDict( + frozen=True, # Immutable after creation + extra="forbid", # No extra fields allowed + strict=True, + ) + + schema_version: str = Field( + ..., + description="Semantic version of this config schema", + pattern=r"^\d+\.\d+(\.\d+)?$", + ) + + def config_hash(self) -> str: + """Generate deterministic hash of configuration.""" + # Exclude timestamps from hash for reproducibility + config_json = self.model_dump_json() + return hashlib.sha256(config_json.encode()).hexdigest()[:16] + + +class LagConfig(FeatureConfigBase): + """Configuration for lag-based features.""" + + schema_version: str = "1.0" + lags: tuple[int, ...] = Field( + default=(1, 7, 14, 28), + description="Lag periods in days", + ) + target_column: str = Field(default="quantity") + fill_value: float | None = Field( + default=None, + description="Value to fill NaN (None = keep NaN)", + ) + + @field_validator("lags") + @classmethod + def validate_lags_positive(cls, v: tuple[int, ...]) -> tuple[int, ...]: + """Ensure all lags are positive (no future leakage).""" + if any(lag <= 0 for lag in v): + raise ValueError("All lags must be positive integers") + return v + + +class RollingConfig(FeatureConfigBase): + """Configuration for rolling window features.""" + + schema_version: str = "1.0" + windows: tuple[int, ...] = Field( + default=(7, 14, 28), + description="Window sizes in days", + ) + aggregations: tuple[Literal["mean", "std", "min", "max", "sum"], ...] 
= Field( + default=("mean", "std"), + ) + target_column: str = Field(default="quantity") + min_periods: int | None = Field( + default=None, + description="Minimum observations required (None = window size)", + ) + + +class CalendarConfig(FeatureConfigBase): + """Configuration for calendar features.""" + + schema_version: str = "1.0" + include_day_of_week: bool = True + include_month: bool = True + include_quarter: bool = True + include_year: bool = False + include_is_weekend: bool = True + include_is_month_end: bool = True + include_is_holiday: bool = True + use_cyclical_encoding: bool = Field( + default=True, + description="Use sin/cos encoding for periodic features", + ) + + +class ExogenousConfig(FeatureConfigBase): + """Configuration for exogenous variable features.""" + + schema_version: str = "1.0" + # Price features + include_price: bool = True + price_lags: tuple[int, ...] = (7, 28) + include_price_change: bool = True + # Promotion features + include_promo: bool = True + # Inventory features + include_inventory: bool = False + include_stockout_flag: bool = True + + +class ImputationConfig(FeatureConfigBase): + """Configuration for missing value imputation.""" + + schema_version: str = "1.0" + strategies: dict[str, Literal["zero", "ffill", "bfill", "mean", "drop"]] = Field( + default_factory=lambda: { + "quantity": "zero", + "unit_price": "ffill", + "total_amount": "zero", + } + ) + + +class FeatureSetConfig(FeatureConfigBase): + """Complete feature engineering configuration.""" + + schema_version: str = "1.0" + + # Feature sub-configurations (None = disabled) + lag_config: LagConfig | None = None + rolling_config: RollingConfig | None = None + calendar_config: CalendarConfig | None = None + exogenous_config: ExogenousConfig | None = None + imputation_config: ImputationConfig | None = None + + # Data grain + entity_columns: tuple[str, ...] 
= ("store_id", "product_id") + date_column: str = "date" + target_column: str = "quantity" + + # Metadata + name: str = Field(..., min_length=1, max_length=100) + description: str | None = None + + def get_enabled_features(self) -> list[str]: + """Return list of enabled feature types.""" + enabled = [] + if self.lag_config: + enabled.append("lag") + if self.rolling_config: + enabled.append("rolling") + if self.calendar_config: + enabled.append("calendar") + if self.exogenous_config: + enabled.append("exogenous") + return enabled + + +# Request/Response schemas for API +class ComputeFeaturesRequest(BaseModel): + """Request body for POST /featuresets/compute.""" + + model_config = ConfigDict(strict=True) + + store_id: int = Field(..., ge=1) + product_id: int = Field(..., ge=1) + cutoff_date: date = Field(..., description="Compute features up to this date (inclusive)") + lookback_days: int = Field(default=365, ge=1, le=1095, description="Days of history to use") + config: FeatureSetConfig + + +class FeatureRow(BaseModel): + """Single row of computed features.""" + + date: date + store_id: int + product_id: int + features: dict[str, float | int | None] + + +class ComputeFeaturesResponse(BaseModel): + """Response body for POST /featuresets/compute.""" + + rows: list[FeatureRow] + feature_columns: list[str] + config_hash: str + cutoff_date: date + row_count: int + null_counts: dict[str, int] + duration_ms: float + + +class PreviewFeaturesRequest(BaseModel): + """Request for GET /featuresets/preview.""" + + store_id: int = Field(..., ge=1) + product_id: int = Field(..., ge=1) + cutoff_date: date + sample_rows: int = Field(default=10, ge=1, le=100) + config: FeatureSetConfig +``` + +#### Settings Extension (app/core/config.py) + +```python +# Add to Settings class: +# Feature Engineering configuration +feature_max_lookback_days: int = Field( + default=1095, # 3 years + ge=30, + le=1825, # 5 years + description="Maximum lookback window for feature computation", +) 
+feature_max_lag: int = Field( + default=365, + ge=1, + le=730, + description="Maximum allowed lag value", +) +feature_max_window: int = Field( + default=90, + ge=1, + le=365, + description="Maximum rolling window size", +) +``` + +### Tasks (Ordered Implementation) + +```yaml +Task 1: Create featuresets feature directory structure + FILES: + - app/features/featuresets/__init__.py + - app/features/featuresets/schemas.py + - app/features/featuresets/service.py + - app/features/featuresets/routes.py + - app/features/featuresets/tests/__init__.py + - app/features/featuresets/tests/conftest.py + VALIDATION: + - ls -la app/features/featuresets/ + +Task 2: Add feature engineering configuration to Settings + MODIFY: app/core/config.py + ADD: + - feature_max_lookback_days: int = 1095 + - feature_max_lag: int = 365 + - feature_max_window: int = 90 + VALIDATION: + - uv run python -c "from app.core.config import get_settings; s = get_settings(); print(f'max_lag={s.feature_max_lag}')" + +Task 3: Implement feature configuration schemas + FILE: app/features/featuresets/schemas.py + IMPLEMENT: + - FeatureConfigBase with config_hash() + - LagConfig with lag validation (positive only) + - RollingConfig with window/aggregation options + - CalendarConfig with cyclical encoding flag + - ExogenousConfig for price/promo/inventory + - ImputationConfig with strategy mapping + - FeatureSetConfig combining all configs + - ComputeFeaturesRequest/Response for API + VALIDATION: + - uv run mypy app/features/featuresets/schemas.py + - uv run pyright app/features/featuresets/schemas.py + +Task 4: Implement FeatureEngineeringService core + FILE: app/features/featuresets/service.py + IMPLEMENT: + - FeatureComputationResult dataclass + - FeatureEngineeringService class + - compute_features(df, cutoff_date) main method + - _compute_lag_features() with groupby + shift + - _compute_rolling_features() with shift(1) + rolling + - _compute_calendar_features() with cyclical encoding + - _apply_imputation() 
+ with strategy dispatch + PSEUDOCODE: See "Service Implementation Pseudocode" section below + VALIDATION: + - uv run mypy app/features/featuresets/service.py + - uv run pyright app/features/featuresets/service.py + +Task 5: Implement data loading service + FILE: app/features/featuresets/service.py (append) + IMPLEMENT: + - FeatureDataLoader class + - load_sales_data(db, store_id, product_id, start_date, end_date) + - load_calendar_data(db, start_date, end_date) + - load_price_data(db, store_id, product_id, start_date, end_date) + - Efficient async queries with proper date filtering + VALIDATION: + - uv run mypy app/features/featuresets/service.py + +Task 6: Implement exogenous feature computation + FILE: app/features/featuresets/service.py (append) + IMPLEMENT: + - _compute_exogenous_features() method + - Price lag features + - Price change percentage + - Promotion active flag + - Stockout flag from inventory + VALIDATION: + - uv run mypy app/features/featuresets/service.py + +Task 7: Implement featuresets routes + FILE: app/features/featuresets/routes.py + IMPLEMENT: + - Router with tag "featuresets" + - POST /featuresets/compute endpoint + - POST /featuresets/preview endpoint + - Error handling with ForecastLabError + VALIDATION: + - uv run mypy app/features/featuresets/routes.py + - uv run pyright app/features/featuresets/routes.py + +Task 8: Register featuresets router in main.py + MODIFY: app/main.py + ADD: + - from app.features.featuresets.routes import router as featuresets_router + - app.include_router(featuresets_router) + VALIDATION: + - uv run python -c "from app.main import app; print([r.path for r in app.routes])" + +Task 9: Create test fixtures with sample time series + FILE: app/features/featuresets/tests/conftest.py + IMPLEMENT: + - sample_time_series fixture (30 days of data) + - sample_feature_config fixture + - sample_lag_config, sample_rolling_config fixtures + - mock_db_session fixture for unit tests + VALIDATION: + - uv run pytest 
app/features/featuresets/tests/ --collect-only + +Task 10: Create unit tests for schemas + FILE: app/features/featuresets/tests/test_schemas.py + IMPLEMENT: + - Test LagConfig rejects negative lags + - Test FeatureSetConfig config_hash() determinism + - Test ComputeFeaturesRequest validation + - Test frozen config immutability + VALIDATION: + - uv run pytest app/features/featuresets/tests/test_schemas.py -v + +Task 11: Create unit tests for service + FILE: app/features/featuresets/tests/test_service.py + IMPLEMENT: + - Test lag feature computation + - Test rolling feature computation + - Test calendar feature computation + - Test imputation strategies + - Test compute_features() integration + VALIDATION: + - uv run pytest app/features/featuresets/tests/test_service.py -v + +Task 12: Create CRITICAL leakage prevention tests + FILE: app/features/featuresets/tests/test_leakage.py + IMPLEMENT: + - test_lag_features_no_future_data() + - test_rolling_features_exclude_current() + - test_cutoff_strictly_enforced() + - test_group_isolation_no_cross_series_leakage() + PATTERN: + - Use sequential values (1,2,3...) 
+ so leakage is mathematically detectable + - Assert feature at row i never uses data from rows > i + VALIDATION: + - uv run pytest app/features/featuresets/tests/test_leakage.py -v + +Task 13: Create integration tests for routes + FILE: app/features/featuresets/tests/test_routes.py + IMPLEMENT: + - Test POST /featuresets/compute with valid payload + - Test POST /featuresets/preview returns sample rows + - Test error handling for missing data + - Test config_hash consistency + REQUIRES: + - Running PostgreSQL with seeded data + VALIDATION: + - uv run pytest app/features/featuresets/tests/test_routes.py -v -m integration + +Task 14: Create example files + FILES: + - examples/features/preview_features.py + - examples/features/config_shape.json + - examples/features/leakage_tests.py + CONTENT: See "Example Files" section below + VALIDATION: + - ls -la examples/features/ + +Task 15: Final validation - Run all quality gates + COMMANDS: + - uv run ruff check app/features/featuresets/ --fix + - uv run ruff format app/features/featuresets/ + - uv run mypy app/features/featuresets/ + - uv run pyright app/features/featuresets/ + - uv run pytest app/features/featuresets/tests/ -v + - uv run pytest app/features/featuresets/tests/test_leakage.py -v # CRITICAL +``` + +### Service Implementation Pseudocode + +```python +"""Feature engineering service - CRITICAL implementation details.""" + +from dataclasses import dataclass, field +from datetime import date +from typing import Any + +import numpy as np +import pandas as pd +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.logging import get_logger +from app.features.data_platform.models import SalesDaily, Calendar, PriceHistory +from app.features.featuresets.schemas import FeatureSetConfig + +logger = get_logger(__name__) + + +@dataclass +class FeatureComputationResult: + """Result of feature computation.""" + + df: pd.DataFrame + feature_columns: list[str] + config_hash: str + stats: 
dict[str, Any] = field(default_factory=dict) + + +class FeatureEngineeringService: + """ + Time-safe feature engineering service. + + CRITICAL: All feature computation respects cutoff_date to prevent leakage. + """ + + def __init__(self, config: FeatureSetConfig) -> None: + self.config = config + self.entity_cols = list(config.entity_columns) + self.date_col = config.date_column + self.target_col = config.target_column + + def compute_features( + self, + df: pd.DataFrame, + cutoff_date: date | None = None, + ) -> FeatureComputationResult: + """ + Compute all configured features. + + Args: + df: Input dataframe with entity columns, date, and target + cutoff_date: Maximum date to include (CRITICAL for time-safety) + + Returns: + FeatureComputationResult with computed features + """ + logger.info( + "featureops.compute_started", + config_hash=self.config.config_hash(), + row_count=len(df), + cutoff_date=str(cutoff_date) if cutoff_date else None, + ) + + result = df.copy() + + # CRITICAL: Sort by entity + date for correct lag/rolling computation + result = result.sort_values(self.entity_cols + [self.date_col]) + + # CRITICAL: Filter to cutoff BEFORE any feature computation + if cutoff_date: + result = result[result[self.date_col] <= cutoff_date] + + feature_columns: list[str] = [] + + # 1. Apply imputation FIRST (fills gaps before lag/rolling) + if self.config.imputation_config: + result = self._apply_imputation(result) + + # 2. Lag features + if self.config.lag_config: + result, cols = self._compute_lag_features(result) + feature_columns.extend(cols) + + # 3. Rolling features (uses shifted data) + if self.config.rolling_config: + result, cols = self._compute_rolling_features(result) + feature_columns.extend(cols) + + # 4. Calendar features (no leakage risk) + if self.config.calendar_config: + result, cols = self._compute_calendar_features(result) + feature_columns.extend(cols) + + # 5. 
Exogenous features + if self.config.exogenous_config: + result, cols = self._compute_exogenous_features(result) + feature_columns.extend(cols) + + stats = { + "input_rows": len(df), + "output_rows": len(result), + "feature_count": len(feature_columns), + "null_counts": result[feature_columns].isnull().sum().to_dict(), + } + + logger.info( + "featureops.compute_completed", + config_hash=self.config.config_hash(), + feature_count=len(feature_columns), + ) + + return FeatureComputationResult( + df=result, + feature_columns=feature_columns, + config_hash=self.config.config_hash(), + stats=stats, + ) + + def _compute_lag_features( + self, df: pd.DataFrame + ) -> tuple[pd.DataFrame, list[str]]: + """ + Compute lag features with proper grouping. + + CRITICAL: shift(lag) uses PAST data only (positive lag = look back) + """ + config = self.config.lag_config + assert config is not None + + result = df.copy() + columns: list[str] = [] + + for lag in config.lags: + col_name = f"lag_{lag}" + # CRITICAL: Group by entity to prevent cross-series leakage + result[col_name] = ( + df.groupby(self.entity_cols)[config.target_column] + .shift(lag) # Positive shift = look back in time + ) + if config.fill_value is not None: + result[col_name] = result[col_name].fillna(config.fill_value) + columns.append(col_name) + + return result, columns + + def _compute_rolling_features( + self, df: pd.DataFrame + ) -> tuple[pd.DataFrame, list[str]]: + """ + Compute rolling window features. + + CRITICAL: shift(1) BEFORE rolling to exclude current observation. + Without shift(1), rolling(7).mean() at row i uses data from [i-6, i] inclusive. + With shift(1), it uses data from [i-7, i-1] — truly past data only. 
+ """ + config = self.config.rolling_config + assert config is not None + + result = df.copy() + columns: list[str] = [] + + for window in config.windows: + min_per = config.min_periods if config.min_periods else window + + for agg in config.aggregations: + col_name = f"rolling_{agg}_{window}" + + # CRITICAL: shift(1) prevents using current row in rolling calculation + result[col_name] = ( + df.groupby(self.entity_cols)[config.target_column] + .transform( + lambda x: x.shift(1) + .rolling(window=window, min_periods=min_per) + .agg(agg) + ) + ) + columns.append(col_name) + + return result, columns + + def _compute_calendar_features( + self, df: pd.DataFrame + ) -> tuple[pd.DataFrame, list[str]]: + """ + Compute calendar-based features. + + Calendar features are derived from the date column itself, + so there's no risk of future leakage. + """ + config = self.config.calendar_config + assert config is not None + + result = df.copy() + columns: list[str] = [] + dates = pd.to_datetime(result[self.date_col]) + + if config.include_day_of_week: + dow = dates.dt.dayofweek # 0=Monday, 6=Sunday + if config.use_cyclical_encoding: + result["dow_sin"] = np.sin(2 * np.pi * dow / 7) + result["dow_cos"] = np.cos(2 * np.pi * dow / 7) + columns.extend(["dow_sin", "dow_cos"]) + else: + result["day_of_week"] = dow + columns.append("day_of_week") + + if config.include_month: + month = dates.dt.month + if config.use_cyclical_encoding: + result["month_sin"] = np.sin(2 * np.pi * month / 12) + result["month_cos"] = np.cos(2 * np.pi * month / 12) + columns.extend(["month_sin", "month_cos"]) + else: + result["month"] = month + columns.append("month") + + if config.include_quarter: + result["quarter"] = dates.dt.quarter + columns.append("quarter") + + if config.include_is_weekend: + result["is_weekend"] = dates.dt.dayofweek.isin([5, 6]).astype(int) + columns.append("is_weekend") + + if config.include_is_month_end: + result["is_month_end"] = dates.dt.is_month_end.astype(int) + 
columns.append("is_month_end") + + # is_holiday requires calendar table lookup (handled separately) + + return result, columns + + def _apply_imputation(self, df: pd.DataFrame) -> pd.DataFrame: + """Apply configured imputation strategies.""" + config = self.config.imputation_config + assert config is not None + + result = df.copy() + + for col, strategy in config.strategies.items(): + if col not in result.columns: + continue + + if strategy == "zero": + result[col] = result[col].fillna(0) + elif strategy == "ffill": + # CRITICAL: Group-aware forward fill + result[col] = result.groupby(self.entity_cols)[col].ffill() + elif strategy == "bfill": + result[col] = result.groupby(self.entity_cols)[col].bfill() + elif strategy == "mean": + result[col] = result.groupby(self.entity_cols)[col].transform( + lambda x: x.fillna(x.mean()) + ) + elif strategy == "drop": + result = result.dropna(subset=[col]) + + return result + + def _compute_exogenous_features( + self, df: pd.DataFrame + ) -> tuple[pd.DataFrame, list[str]]: + """Compute exogenous features (price, promo, inventory).""" + config = self.config.exogenous_config + assert config is not None + + result = df.copy() + columns: list[str] = [] + + # Price features (if price column exists) + if config.include_price and "unit_price" in df.columns: + for lag in config.price_lags: + col_name = f"price_lag_{lag}" + result[col_name] = ( + df.groupby(self.entity_cols)["unit_price"] + .shift(lag) + ) + columns.append(col_name) + + if config.include_price_change: + # Price change vs 7 days ago + result["price_pct_change_7d"] = ( + df.groupby(self.entity_cols)["unit_price"] + .pct_change(periods=7) + ) + columns.append("price_pct_change_7d") + + return result, columns +``` + +### Integration Points + +```yaml +DATABASE: + - No new migrations required (uses existing SalesDaily, Calendar, PriceHistory) + - Queries use existing grain constraint for efficient lookups + - Calendar table provides is_holiday flag + +CONFIG: + - MODIFY: 
app/core/config.py + - ADD: feature_max_lookback_days, feature_max_lag, feature_max_window + +ROUTES: + - MODIFY: app/main.py + - ADD: app.include_router(featuresets_router) + - ENDPOINTS: + - POST /featuresets/compute + - POST /featuresets/preview + +DEPENDENCIES: + - pandas (add to pyproject.toml if not present) + - numpy (add to pyproject.toml if not present) + - SalesDaily table must have data for the requested series/date range + - Calendar table should have entries for dates in range + +REGISTRY INTEGRATION (Future - INITIAL-7): + - FeatureSetConfig.config_hash() enables registry lookups + - FeatureSetConfig.model_dump_json() for registry storage +``` + +--- + +## Validation Loop + +### Level 1: Syntax & Style + +```bash +# Run FIRST - fix any errors before proceeding +uv run ruff check app/features/featuresets/ --fix +uv run ruff format app/features/featuresets/ + +# Expected: No errors +``` + +### Level 2: Type Checking + +```bash +# Run SECOND - type safety is non-negotiable +uv run mypy app/features/featuresets/ +uv run pyright app/features/featuresets/ + +# Expected: 0 errors +``` + +### Level 3: Unit Tests + +```bash +# Run THIRD - verify schemas and service logic +uv run pytest app/features/featuresets/tests/test_schemas.py -v +uv run pytest app/features/featuresets/tests/test_service.py -v + +# Expected: All tests pass +``` + +### Level 4: CRITICAL Leakage Tests + +```bash +# Run FOURTH - these tests are NON-NEGOTIABLE +uv run pytest app/features/featuresets/tests/test_leakage.py -v + +# Expected: ALL leakage tests pass +# If any fail: STOP and fix immediately - leakage is a critical bug +``` + +### Level 5: Integration Tests + +```bash +# Run FIFTH - verify API and database behavior +docker-compose up -d +sleep 5 +uv run alembic upgrade head +uv run python examples/seed_demo_data.py + +uv run pytest app/features/featuresets/tests/test_routes.py -v -m integration + +docker-compose down + +# Expected: All tests pass +``` + +### Level 6: Manual API Test 
+ +```bash +# Start API server +uv run uvicorn app.main:app --reload --port 8123 + +# Test compute endpoint +curl -X POST http://localhost:8123/featuresets/compute \ + -H "Content-Type: application/json" \ + -d '{ + "store_id": 1, + "product_id": 1, + "cutoff_date": "2024-01-31", + "lookback_days": 60, + "config": { + "schema_version": "1.0", + "name": "test_config", + "lag_config": {"schema_version": "1.0", "lags": [1, 7, 14]}, + "rolling_config": {"schema_version": "1.0", "windows": [7, 14], "aggregations": ["mean", "std"]}, + "calendar_config": {"schema_version": "1.0"} + } + }' + +# Expected: {"rows": [...], "feature_columns": [...], "config_hash": "...", ...} +``` + +--- + +## Final Validation Checklist + +- [ ] `uv run ruff check app/features/featuresets/` passes with no errors +- [ ] `uv run ruff format --check app/features/featuresets/` passes +- [ ] `uv run mypy app/features/featuresets/` passes with 0 errors +- [ ] `uv run pyright app/features/featuresets/` passes with 0 errors +- [ ] `uv run pytest app/features/featuresets/tests/test_schemas.py -v` all tests pass +- [ ] `uv run pytest app/features/featuresets/tests/test_service.py -v` all tests pass +- [ ] `uv run pytest app/features/featuresets/tests/test_leakage.py -v` **ALL leakage tests pass** +- [ ] `uv run pytest app/features/featuresets/tests/test_routes.py -v -m integration` all tests pass +- [ ] POST /featuresets/compute returns correct response structure +- [ ] Lag features only use past data (shift with positive values) +- [ ] Rolling features exclude current observation (shift(1) before rolling) +- [ ] Calendar features use cyclical encoding +- [ ] Config hash is deterministic (same config = same hash) +- [ ] Imputation respects group boundaries +- [ ] Logs follow `featureops.{action}_{state}` naming convention +- [ ] `examples/features/` contains working examples + +--- + +## Anti-Patterns to Avoid + +- ❌ **Don't** use negative shift values — causes future leakage +- ❌ **Don't** apply 
rolling without shift(1) — includes current observation +- ❌ **Don't** compute features without explicit cutoff_date parameter +- ❌ **Don't** use global statistics (mean/std of entire dataset) +- ❌ **Don't** forget groupby for multi-series data — causes cross-series leakage +- ❌ **Don't** use random train/test splits — use time-based splits only +- ❌ **Don't** use integer encoding for cyclical features (day of week, month) +- ❌ **Don't** hardcode feature configurations — use FeatureSetConfig +- ❌ **Don't** skip leakage tests — they are the most important tests +- ❌ **Don't** use float for monetary values — use Decimal + +--- + +## Example Files + +### examples/features/config_shape.json + +```json +{ + "schema_version": "1.0", + "name": "retail_demand_v1", + "description": "Standard feature set for retail demand forecasting", + "entity_columns": ["store_id", "product_id"], + "date_column": "date", + "target_column": "quantity", + "lag_config": { + "schema_version": "1.0", + "lags": [1, 7, 14, 28, 364], + "target_column": "quantity", + "fill_value": null + }, + "rolling_config": { + "schema_version": "1.0", + "windows": [7, 14, 28], + "aggregations": ["mean", "std"], + "target_column": "quantity", + "min_periods": null + }, + "calendar_config": { + "schema_version": "1.0", + "include_day_of_week": true, + "include_month": true, + "include_quarter": true, + "include_year": false, + "include_is_weekend": true, + "include_is_month_end": true, + "include_is_holiday": true, + "use_cyclical_encoding": true + }, + "exogenous_config": { + "schema_version": "1.0", + "include_price": true, + "price_lags": [7, 28], + "include_price_change": true, + "include_promo": true, + "include_inventory": false, + "include_stockout_flag": true + }, + "imputation_config": { + "schema_version": "1.0", + "strategies": { + "quantity": "zero", + "unit_price": "ffill", + "total_amount": "zero" + } + } +} +``` + +### examples/features/preview_features.py + +```python +"""Preview feature 
computation for a single series.""" + +import asyncio +from datetime import date + +import httpx + + +async def preview_features() -> None: + """Preview features for store 1, product 1.""" + async with httpx.AsyncClient() as client: + response = await client.post( + "http://localhost:8123/featuresets/compute", + json={ + "store_id": 1, + "product_id": 1, + "cutoff_date": "2024-01-31", + "lookback_days": 60, + "config": { + "schema_version": "1.0", + "name": "preview_test", + "lag_config": { + "schema_version": "1.0", + "lags": [1, 7, 14], + }, + "rolling_config": { + "schema_version": "1.0", + "windows": [7, 14], + "aggregations": ["mean", "std"], + }, + "calendar_config": { + "schema_version": "1.0", + }, + }, + }, + ) + + result = response.json() + print(f"Config Hash: {result['config_hash']}") + print(f"Feature Columns: {result['feature_columns']}") + print(f"Row Count: {result['row_count']}") + print(f"Null Counts: {result['null_counts']}") + print(f"\nSample Rows:") + for row in result["rows"][:5]: + print(f" {row['date']}: {row['features']}") + + +if __name__ == "__main__": + asyncio.run(preview_features()) +``` + +--- + +## Confidence Score: 8/10 + +**Rationale:** + +- (+) Clear time-safety patterns with explicit cutoff enforcement +- (+) Comprehensive leakage prevention with dedicated tests +- (+) Follows existing codebase patterns (vertical slice, schemas, logging) +- (+) Pydantic v2 config with versioning and hashing for reproducibility +- (+) Type-safe throughout with strict mypy/pyright +- (+) Well-documented gotchas and anti-patterns +- (+) Calendar cyclical encoding is best practice for ML +- (-) Exogenous features require data from PriceHistory/Promotion tables (more complex queries) +- (-) Large datasets may need chunked processing (YAGNI for now) +- (-) Holiday features require Calendar table is_holiday flag to be populated + +**Recommended Approach:** + +1. Execute tasks 1-3 (directory structure, config, schemas) +2. 
Run type checkers after each file +3. Execute tasks 4-6 (service implementation) +4. Run unit tests after service +5. Execute tasks 7-8 (routes, main registration) +6. Execute tasks 9-12 (test fixtures, all tests) +7. **CRITICAL: Ensure all leakage tests pass before proceeding** +8. Execute tasks 13-14 (examples, integration tests) +9. Run full validation loop + +--- + +## Version + +- **PRP Version:** 1.0 +- **Target INITIAL:** INITIAL-4.md (Feature Engineering) +- **Created:** 2026-01-31 +- **Author:** Claude Code + +--- + +## References + +### Time-Series Feature Engineering +- [MLForecast Feature Engineering](https://www.nixtla.io/blog/automated-time-series-feature-engineering-with-mlforecast) +- [sktime Transformations API](https://www.sktime.net/en/stable/api_reference/transformations.html) +- [Avoiding Data Leakage in Time Series](https://towardsdatascience.com/avoiding-data-leakage-in-timeseries-101-25ea13fcb15f) + +### Best Practices +- [Azure AutoML Calendar Features](https://learn.microsoft.com/en-us/azure/machine-learning/concept-automl-forecasting-calendar-features) +- [scikit-learn Pipeline Composition](https://scikit-learn.org/stable/modules/compose.html) +- [Cyclical Feature Encoding](https://developer.nvidia.com/blog/three-approaches-to-encoding-time-information-as-features-for-ml-models/) + +### Pydantic v2 +- [Field Validators](https://docs.pydantic.dev/latest/concepts/validators/) +- [Model Configuration](https://docs.pydantic.dev/latest/concepts/config/) + +### Project Internal +- [CLAUDE.md](../CLAUDE.md) - Type safety, vertical slice architecture +- [Logging Standard](../docs/validation/logging-standard.md) - Event naming convention +- [Ingest Service Pattern](../app/features/ingest/service.py) - Async service patterns diff --git a/README.md b/README.md index 68e22415..0d5607d9 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,8 @@ app/ ├── shared/ # Pagination, timestamps, error schemas ├── features/ │ ├── data_platform/ # Store, product, 
calendar, sales tables -│ └── ingest/ # Batch upsert endpoints for sales data +│ ├── ingest/ # Batch upsert endpoints for sales data +│ └── featuresets/ # Time-safe feature engineering (lags, rolling, calendar) └── main.py # FastAPI entry point tests/ # Test fixtures and helpers @@ -101,7 +102,8 @@ alembic/ # Database migrations examples/ ├── api/ # HTTP client examples ├── schema/ # Table documentation -└── queries/ # Example SQL queries +├── queries/ # Example SQL queries +└── compute_features_demo.py # Feature engineering demo scripts/ # Utility scripts ``` @@ -150,6 +152,38 @@ curl -X POST http://localhost:8123/ingest/sales-daily \ See [examples/api/ingest_sales_daily.http](examples/api/ingest_sales_daily.http) for more examples. +### Feature Engineering + +- `POST /featuresets/compute` - Compute time-safe features for a series +- `POST /featuresets/preview` - Preview features with sample rows + +**Example Request:** +```bash +curl -X POST http://localhost:8123/featuresets/compute \ + -H "Content-Type: application/json" \ + -d '{ + "store_id": 1, + "product_id": 1, + "cutoff_date": "2024-01-31", + "lookback_days": 365, + "config": { + "name": "retail_forecast_v1", + "lag_config": {"lags": [1, 7, 14, 28]}, + "rolling_config": {"windows": [7, 14], "aggregations": ["mean", "std"]}, + "calendar_config": {"include_day_of_week": true, "use_cyclical_encoding": true} + } + }' +``` + +**Features:** +- **Time-safe computation**: All features use only data up to cutoff_date (no future leakage) +- **Lag features**: Past values at specified lag periods (shift with positive values only) +- **Rolling features**: Rolling statistics with shift(1) to exclude current observation +- **Calendar features**: Cyclical encoding (sin/cos) for day of week, month +- **Group isolation**: Entity-aware groupby prevents cross-series leakage + +See [examples/compute_features_demo.py](examples/compute_features_demo.py) for a complete demo. 
+ ## API Documentation Once the server is running: diff --git a/app/core/config.py b/app/core/config.py index 95b04106..a32ddcc2 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -35,6 +35,11 @@ class Settings(BaseSettings): ingest_batch_size: int = 1000 ingest_timeout_seconds: int = 60 + # Feature Engineering + feature_max_lookback_days: int = 1095 # 3 years + feature_max_lag: int = 365 + feature_max_window: int = 90 + @property def is_development(self) -> bool: """Check if running in development mode.""" diff --git a/app/features/featuresets/__init__.py b/app/features/featuresets/__init__.py new file mode 100644 index 00000000..b6958cb5 --- /dev/null +++ b/app/features/featuresets/__init__.py @@ -0,0 +1,33 @@ +"""Feature engineering module for time-safe feature computation.""" + +from app.features.featuresets.schemas import ( + CalendarConfig, + ComputeFeaturesRequest, + ComputeFeaturesResponse, + ExogenousConfig, + FeatureRow, + FeatureSetConfig, + ImputationConfig, + LagConfig, + PreviewFeaturesRequest, + RollingConfig, +) +from app.features.featuresets.service import ( + FeatureComputationResult, + FeatureEngineeringService, +) + +__all__ = [ + "CalendarConfig", + "ComputeFeaturesRequest", + "ComputeFeaturesResponse", + "ExogenousConfig", + "FeatureComputationResult", + "FeatureEngineeringService", + "FeatureRow", + "FeatureSetConfig", + "ImputationConfig", + "LagConfig", + "PreviewFeaturesRequest", + "RollingConfig", +] diff --git a/app/features/featuresets/routes.py b/app/features/featuresets/routes.py new file mode 100644 index 00000000..221cf537 --- /dev/null +++ b/app/features/featuresets/routes.py @@ -0,0 +1,342 @@ +"""Feature engineering API routes for feature computation and preview.""" + +import math +import time +from datetime import date as date_type + +import pandas as pd +from fastapi import APIRouter, Depends, HTTPException, status +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from 
app.core.database import get_db +from app.core.exceptions import DatabaseError, NotFoundError +from app.core.logging import get_logger +from app.features.featuresets.schemas import ( + ComputeFeaturesRequest, + ComputeFeaturesResponse, + FeatureRow, + PreviewFeaturesRequest, +) +from app.features.featuresets.service import compute_features_for_series + +logger = get_logger(__name__) + +router = APIRouter(prefix="/featuresets", tags=["featuresets"]) + + +@router.post( + "/compute", + response_model=ComputeFeaturesResponse, + status_code=status.HTTP_200_OK, + summary="Compute features for a series", + description=""" +Compute time-safe features for a single store/product series. + +**Time Safety:** All features are computed using only data up to and including +the cutoff_date. This prevents future data leakage in training pipelines. + +**Feature Types:** +- **Lag features:** Past values at specified lag periods +- **Rolling features:** Rolling statistics (mean, std, etc.) over windows +- **Calendar features:** Day of week, month, quarter with cyclical encoding +- **Exogenous features:** Price lags, price changes, stockout flags + +**Configuration:** Pass a FeatureSetConfig to enable/disable specific feature types. +Each sub-config (lag_config, rolling_config, etc.) can be null to disable. +""", +) +async def compute_features( + request: ComputeFeaturesRequest, + db: AsyncSession = Depends(get_db), +) -> ComputeFeaturesResponse: + """Compute features for a single series. + + Args: + request: Feature computation request with config. + db: Async database session from dependency. + + Returns: + Response with computed features and metadata. + + Raises: + NotFoundError: If no data found for the series. + DatabaseError: If database operation fails. 
+ """ + start_time = time.perf_counter() + + logger.info( + "featureops.compute_request_received", + store_id=request.store_id, + product_id=request.product_id, + cutoff_date=str(request.cutoff_date), + lookback_days=request.lookback_days, + config_hash=request.config.config_hash(), + ) + + try: + result = await compute_features_for_series( + db=db, + store_id=request.store_id, + product_id=request.product_id, + cutoff_date=request.cutoff_date, + lookback_days=request.lookback_days, + config=request.config, + ) + + duration_ms = (time.perf_counter() - start_time) * 1000 + + # Check if any data was found + if result.df.empty: + logger.warning( + "featureops.compute_no_data", + store_id=request.store_id, + product_id=request.product_id, + ) + raise NotFoundError( + message=f"No data found for store_id={request.store_id}, product_id={request.product_id}", + details={ + "store_id": request.store_id, + "product_id": request.product_id, + "cutoff_date": str(request.cutoff_date), + }, + ) + + # Convert dataframe to response rows using records for type safety + rows: list[FeatureRow] = [] + records = result.df.to_dict("records") + for record in records: + # Extract features, handling NaN/None + features: dict[str, float | int | None] = {} + for col in result.feature_columns: + val = record.get(col) + if val is None or (isinstance(val, float) and math.isnan(val)): + features[col] = None + elif isinstance(val, (int, float)): + features[col] = float(val) if isinstance(val, float) else int(val) + else: + features[col] = None + + # Extract date, handling Timestamp + date_val = record.get(request.config.date_column) + row_date: date_type + if isinstance(date_val, pd.Timestamp): + row_date = date_val.date() + elif isinstance(date_val, date_type): + row_date = date_val + elif date_val is not None and hasattr(date_val, "date"): + row_date = date_val.date() + else: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Cannot extract date from unsupported type: 
{type(date_val).__name__}", + ) + + # Validate store_id/product_id presence + store_id_val = record.get("store_id") + product_id_val = record.get("product_id") + if store_id_val is None or int(store_id_val) < 1: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Missing or invalid store_id in data record", + ) + if product_id_val is None or int(product_id_val) < 1: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Missing or invalid product_id in data record", + ) + + rows.append( + FeatureRow( + date=row_date, + store_id=int(store_id_val), + product_id=int(product_id_val), + features=features, + ) + ) + + # Convert null_counts values to int + null_counts = {k: int(v) for k, v in result.stats.get("null_counts", {}).items()} + + logger.info( + "featureops.compute_request_completed", + store_id=request.store_id, + product_id=request.product_id, + row_count=len(rows), + feature_count=len(result.feature_columns), + duration_ms=round(duration_ms, 2), + ) + + return ComputeFeaturesResponse( + rows=rows, + feature_columns=result.feature_columns, + config_hash=result.config_hash, + cutoff_date=request.cutoff_date, + row_count=len(rows), + null_counts=null_counts, + duration_ms=round(duration_ms, 2), + ) + + except NotFoundError: + raise + except SQLAlchemyError as e: + logger.error( + "featureops.compute_request_failed", + error=str(e), + error_type=type(e).__name__, + exc_info=True, + ) + raise DatabaseError( + message="Failed to compute features", + details={"error": str(e)}, + ) from e + + +@router.post( + "/preview", + response_model=ComputeFeaturesResponse, + status_code=status.HTTP_200_OK, + summary="Preview features for a series", + description=""" +Preview computed features for a single store/product series. + +Returns a limited number of sample rows for debugging and exploration. +Uses the same computation logic as /compute but limits output rows. 
+""", +) +async def preview_features( + request: PreviewFeaturesRequest, + db: AsyncSession = Depends(get_db), +) -> ComputeFeaturesResponse: + """Preview features for a single series. + + Args: + request: Preview request with config and sample size. + db: Async database session from dependency. + + Returns: + Response with sample feature rows. + + Raises: + NotFoundError: If no data found for the series. + DatabaseError: If database operation fails. + """ + start_time = time.perf_counter() + + logger.info( + "featureops.preview_request_received", + store_id=request.store_id, + product_id=request.product_id, + cutoff_date=str(request.cutoff_date), + sample_rows=request.sample_rows, + ) + + try: + # Use default lookback for preview + result = await compute_features_for_series( + db=db, + store_id=request.store_id, + product_id=request.product_id, + cutoff_date=request.cutoff_date, + lookback_days=365, + config=request.config, + ) + + duration_ms = (time.perf_counter() - start_time) * 1000 + + if result.df.empty: + raise NotFoundError( + message=f"No data found for store_id={request.store_id}, product_id={request.product_id}", + details={ + "store_id": request.store_id, + "product_id": request.product_id, + }, + ) + + # Limit to sample_rows (take last N rows) + sample_df = result.df.tail(request.sample_rows) + + # Convert to response rows using records for type safety + rows: list[FeatureRow] = [] + records = sample_df.to_dict("records") + for record in records: + # Extract features, handling NaN/None + features: dict[str, float | int | None] = {} + for col in result.feature_columns: + val = record.get(col) + if val is None or (isinstance(val, float) and math.isnan(val)): + features[col] = None + elif isinstance(val, (int, float)): + features[col] = float(val) if isinstance(val, float) else int(val) + else: + features[col] = None + + # Extract date, handling Timestamp + date_val = record.get(request.config.date_column) + row_date: date_type + if isinstance(date_val, 
pd.Timestamp): + row_date = date_val.date() + elif isinstance(date_val, date_type): + row_date = date_val + elif date_val is not None and hasattr(date_val, "date"): + row_date = date_val.date() + else: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Cannot extract date from unsupported type: {type(date_val).__name__}", + ) + + # Validate store_id/product_id presence + store_id_val = record.get("store_id") + product_id_val = record.get("product_id") + if store_id_val is None or int(store_id_val) < 1: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Missing or invalid store_id in data record", + ) + if product_id_val is None or int(product_id_val) < 1: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Missing or invalid product_id in data record", + ) + + rows.append( + FeatureRow( + date=row_date, + store_id=int(store_id_val), + product_id=int(product_id_val), + features=features, + ) + ) + + null_counts = {k: int(v) for k, v in result.stats.get("null_counts", {}).items()} + + logger.info( + "featureops.preview_request_completed", + row_count=len(rows), + duration_ms=round(duration_ms, 2), + ) + + return ComputeFeaturesResponse( + rows=rows, + feature_columns=result.feature_columns, + config_hash=result.config_hash, + cutoff_date=request.cutoff_date, + row_count=len(rows), + null_counts=null_counts, + duration_ms=round(duration_ms, 2), + ) + + except NotFoundError: + raise + except SQLAlchemyError as e: + logger.error( + "featureops.preview_request_failed", + error=str(e), + error_type=type(e).__name__, + exc_info=True, + ) + raise DatabaseError( + message="Failed to preview features", + details={"error": str(e)}, + ) from e diff --git a/app/features/featuresets/schemas.py b/app/features/featuresets/schemas.py new file mode 100644 index 00000000..d041ebb7 --- /dev/null +++ b/app/features/featuresets/schemas.py @@ -0,0 +1,355 @@ +"""Pydantic schemas for feature engineering 
configuration. + +Feature configs are designed to be: +- Immutable (frozen=True) for reproducibility +- Versioned (schema_version) for registry storage +- Hashable (config_hash) for deduplication +""" + +from __future__ import annotations + +import hashlib +from datetime import date as date_type +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + + +class FeatureConfigBase(BaseModel): + """Base configuration with versioning support. + + All feature configs inherit from this base to ensure: + - Immutability after creation (frozen=True) + - No extra fields allowed (extra="forbid") + - Schema versioning for reproducibility + """ + + model_config = ConfigDict( + frozen=True, + extra="forbid", + ) + + schema_version: str = Field( + default="1.0", + description="Semantic version of this config schema", + pattern=r"^\d+\.\d+(\.\d+)?$", + ) + + def config_hash(self) -> str: + """Generate deterministic hash of configuration. + + Returns: + 16-character hex string hash of config JSON. + """ + config_json = self.model_dump_json() + return hashlib.sha256(config_json.encode()).hexdigest()[:16] + + +class LagConfig(FeatureConfigBase): + """Configuration for lag-based features. + + Lag features capture autoregressive patterns where past values + predict future values. All lags must be positive to prevent + future data leakage. + + Attributes: + lags: Tuple of positive lag periods in days. + target_column: Column to compute lags from. + fill_value: Value to fill NaN (None = keep NaN). + """ + + lags: tuple[int, ...] 
= Field( + default=(1, 7, 14, 28), + description="Lag periods in days (must be positive)", + ) + target_column: str = Field(default="quantity") + fill_value: float | None = Field( + default=None, + description="Value to fill NaN (None = keep NaN)", + ) + + @field_validator("lags") + @classmethod + def validate_lags_positive(cls, v: tuple[int, ...]) -> tuple[int, ...]: + """Ensure all lags are positive (no future leakage).""" + if not v: + raise ValueError("At least one lag must be specified") + if any(lag <= 0 for lag in v): + raise ValueError("All lags must be positive integers (no future leakage)") + return v + + +class RollingConfig(FeatureConfigBase): + """Configuration for rolling window features. + + Rolling features capture trends and volatility over fixed windows. + Uses shift(1) before rolling to exclude current observation. + + Attributes: + windows: Tuple of window sizes in days. + aggregations: Tuple of aggregation functions to apply. + target_column: Column to compute rolling features from. + min_periods: Minimum observations required (None = window size). + """ + + windows: tuple[int, ...] = Field( + default=(7, 14, 28), + description="Window sizes in days", + ) + aggregations: tuple[Literal["mean", "std", "min", "max", "sum"], ...] = Field( + default=("mean", "std"), + description="Aggregation functions to apply", + ) + target_column: str = Field(default="quantity") + min_periods: int | None = Field( + default=None, + description="Minimum observations required (None = window size)", + ) + + @field_validator("windows") + @classmethod + def validate_windows_positive(cls, v: tuple[int, ...]) -> tuple[int, ...]: + """Ensure all windows are positive.""" + if not v: + raise ValueError("At least one window must be specified") + if any(w <= 0 for w in v): + raise ValueError("All windows must be positive integers") + return v + + +class CalendarConfig(FeatureConfigBase): + """Configuration for calendar features. 
+ + Calendar features capture cyclical patterns (day of week, month) + and special events (holidays). No leakage risk as features are + derived from the date column itself. + + Attributes: + include_day_of_week: Include day of week features. + include_month: Include month features. + include_quarter: Include quarter features. + include_year: Include year feature. + include_is_weekend: Include weekend flag. + include_is_month_end: Include month-end flag. + include_is_holiday: Include holiday flag (requires Calendar table). + use_cyclical_encoding: Use sin/cos encoding for periodic features. + """ + + include_day_of_week: bool = True + include_month: bool = True + include_quarter: bool = True + include_year: bool = False + include_is_weekend: bool = True + include_is_month_end: bool = True + include_is_holiday: bool = True + use_cyclical_encoding: bool = Field( + default=True, + description="Use sin/cos encoding for periodic features", + ) + + +class ExogenousConfig(FeatureConfigBase): + """Configuration for exogenous variable features. + + Exogenous features capture external factors (price, promotions, + inventory) that affect demand. All features are lagged appropriately + to prevent leakage. + + Attributes: + include_price: Include price features. + price_lags: Lag periods for price features. + include_price_change: Include price change percentage. + include_promo: Include promotion flags. + include_inventory: Include inventory levels. + include_stockout_flag: Include stockout flag. + """ + + include_price: bool = True + price_lags: tuple[int, ...] 
= Field( + default=(7, 28), + description="Lag periods for price features", + ) + include_price_change: bool = True + include_promo: bool = True + include_inventory: bool = False + include_stockout_flag: bool = True + + @field_validator("price_lags") + @classmethod + def validate_price_lags_positive(cls, v: tuple[int, ...]) -> tuple[int, ...]: + """Ensure all price lags are positive.""" + if any(lag <= 0 for lag in v): + raise ValueError("All price lags must be positive integers") + return v + + +class ImputationConfig(FeatureConfigBase): + """Configuration for missing value imputation. + + Imputation strategies: + - zero: Fill with 0 (for sales/quantity) — TIME-SAFE + - ffill: Forward fill (for prices) — TIME-SAFE + - bfill: Backward fill — WARNING: uses future data, avoid in production + - mean: Fill with group mean — WARNING: uses future data, avoid in production + - expanding_mean: Fill with expanding mean — TIME-SAFE (uses only past data) + - drop: Drop rows with missing values — TIME-SAFE + + Attributes: + strategies: Mapping of column name to imputation strategy. + """ + + strategies: dict[str, Literal["zero", "ffill", "bfill", "mean", "expanding_mean", "drop"]] = ( + Field( + default={ + "quantity": "zero", + "unit_price": "ffill", + "total_amount": "zero", + } + ) + ) + + +class FeatureSetConfig(FeatureConfigBase): + """Complete feature engineering configuration. + + Combines all feature sub-configurations into a single config. + Each sub-config can be None to disable that feature type. + + Attributes: + name: Human-readable name for this feature set. + description: Optional description. + entity_columns: Columns defining the entity (e.g., store_id, product_id). + date_column: Column containing the date. + target_column: Column containing the target variable. + lag_config: Configuration for lag features (None = disabled). + rolling_config: Configuration for rolling features (None = disabled). 
+ calendar_config: Configuration for calendar features (None = disabled). + exogenous_config: Configuration for exogenous features (None = disabled). + imputation_config: Configuration for imputation (None = disabled). + """ + + name: str = Field(..., min_length=1, max_length=100, description="Feature set name") + description: str | None = Field(default=None, description="Optional description") + + # Data grain + entity_columns: tuple[str, ...] = Field( + default=("store_id", "product_id"), + description="Columns defining the entity grain", + ) + date_column: str = Field(default="date") + target_column: str = Field(default="quantity") + + # Feature sub-configurations (None = disabled) + lag_config: LagConfig | None = None + rolling_config: RollingConfig | None = None + calendar_config: CalendarConfig | None = None + exogenous_config: ExogenousConfig | None = None + imputation_config: ImputationConfig | None = None + + def get_enabled_features(self) -> list[str]: + """Return list of enabled feature types. + + Returns: + List of enabled feature type names. + """ + enabled: list[str] = [] + if self.lag_config: + enabled.append("lag") + if self.rolling_config: + enabled.append("rolling") + if self.calendar_config: + enabled.append("calendar") + if self.exogenous_config: + enabled.append("exogenous") + return enabled + + +# ============================================================================= +# API Request/Response Schemas +# ============================================================================= + + +class ComputeFeaturesRequest(BaseModel): + """Request body for POST /featuresets/compute. + + Attributes: + store_id: Store ID to compute features for. + product_id: Product ID to compute features for. + cutoff_date: Maximum date to include (CRITICAL for time-safety). + lookback_days: Days of history to use. + config: Feature set configuration. 
+ """ + + model_config = ConfigDict(strict=True) + + store_id: int = Field(..., ge=1, description="Store ID") + product_id: int = Field(..., ge=1, description="Product ID") + cutoff_date: date_type = Field( + ..., + description="Compute features up to this date (inclusive)", + ) + lookback_days: int = Field( + default=365, + ge=1, + le=1095, + description="Days of history to use", + ) + config: FeatureSetConfig + + +class FeatureRow(BaseModel): + """Single row of computed features. + + Attributes: + date: Date for this row. + store_id: Store ID. + product_id: Product ID. + features: Dictionary of feature name to value. + """ + + date: date_type + store_id: int + product_id: int + features: dict[str, float | int | None] + + +class ComputeFeaturesResponse(BaseModel): + """Response body for POST /featuresets/compute. + + Attributes: + rows: List of feature rows. + feature_columns: List of computed feature column names. + config_hash: Hash of the configuration used. + cutoff_date: Cutoff date used. + row_count: Number of rows returned. + null_counts: Count of null values per feature. + duration_ms: Processing duration in milliseconds. + """ + + rows: list[FeatureRow] + feature_columns: list[str] + config_hash: str + cutoff_date: date_type + row_count: int + null_counts: dict[str, int] + duration_ms: float + + +class PreviewFeaturesRequest(BaseModel): + """Request for POST /featuresets/preview. + + Attributes: + store_id: Store ID to preview features for. + product_id: Product ID to preview features for. + cutoff_date: Cutoff date for features. + sample_rows: Number of sample rows to return. + config: Feature set configuration. 
+ """ + + model_config = ConfigDict(strict=True) + + store_id: int = Field(..., ge=1) + product_id: int = Field(..., ge=1) + cutoff_date: date_type + sample_rows: int = Field(default=10, ge=1, le=100) + config: FeatureSetConfig diff --git a/app/features/featuresets/service.py b/app/features/featuresets/service.py new file mode 100644 index 00000000..71707022 --- /dev/null +++ b/app/features/featuresets/service.py @@ -0,0 +1,588 @@ +"""Feature engineering service for time-safe feature computation. + +CRITICAL: All feature computation respects cutoff_date to prevent leakage. +- Lag features use shift(lag) with positive lag values only +- Rolling features use shift(1) BEFORE rolling to exclude current observation +- Calendar features are derived from date column (no leakage risk) +- Exogenous features are lagged appropriately +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import date as date_type +from datetime import timedelta +from typing import TYPE_CHECKING, Any + +import numpy as np +import pandas as pd +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.logging import get_logger +from app.features.data_platform.models import Calendar, SalesDaily +from app.features.featuresets.schemas import FeatureSetConfig + +if TYPE_CHECKING: + pass + +logger = get_logger(__name__) + + +@dataclass +class FeatureComputationResult: + """Result of feature computation. + + Attributes: + df: DataFrame with computed features. + feature_columns: List of computed feature column names. + config_hash: Hash of the configuration used. + stats: Statistics about the computation. + """ + + df: pd.DataFrame + feature_columns: list[str] + config_hash: str + stats: dict[str, Any] = field(default_factory=lambda: {}) + + +class FeatureEngineeringService: + """Time-safe feature engineering service. + + CRITICAL: All feature computation respects cutoff_date to prevent leakage. 
+ Features are computed using only data available at prediction time. + + Example: + >>> config = FeatureSetConfig( + ... name="test", + ... lag_config=LagConfig(lags=(1, 7)), + ... ) + >>> service = FeatureEngineeringService(config) + >>> result = service.compute_features(df, cutoff_date=date(2024, 1, 31)) + """ + + def __init__(self, config: FeatureSetConfig) -> None: + """Initialize service with configuration. + + Args: + config: Feature set configuration. + """ + self.config = config + self.entity_cols = list(config.entity_columns) + self.date_col = config.date_column + self.target_col = config.target_column + + def compute_features( + self, + df: pd.DataFrame, + cutoff_date: date_type | None = None, + ) -> FeatureComputationResult: + """Compute all configured features. + + CRITICAL: Filters data to cutoff_date BEFORE any feature computation + to ensure no future data leakage. + + Args: + df: Input dataframe with entity columns, date, and target. + cutoff_date: Maximum date to include (CRITICAL for time-safety). + + Returns: + FeatureComputationResult with computed features. + """ + logger.info( + "featureops.compute_started", + config_hash=self.config.config_hash(), + row_count=len(df), + cutoff_date=str(cutoff_date) if cutoff_date else None, + ) + + input_rows = len(df) + result = df.copy() + + # CRITICAL: Sort by entity + date for correct lag/rolling computation + result = result.sort_values([*self.entity_cols, self.date_col]) + + # CRITICAL: Filter to cutoff BEFORE any feature computation + if cutoff_date: + date_series = pd.to_datetime(result[self.date_col]).dt.date + result = result[date_series <= cutoff_date] + + feature_columns: list[str] = [] + + # 1. Apply imputation FIRST (fills gaps before lag/rolling) + if self.config.imputation_config: + result = self._apply_imputation(result) + + # 2. Lag features + if self.config.lag_config: + result, cols = self._compute_lag_features(result) + feature_columns.extend(cols) + + # 3. 
Rolling features (uses shifted data) + if self.config.rolling_config: + result, cols = self._compute_rolling_features(result) + feature_columns.extend(cols) + + # 4. Calendar features (no leakage risk) + if self.config.calendar_config: + result, cols = self._compute_calendar_features(result) + feature_columns.extend(cols) + + # 5. Exogenous features + if self.config.exogenous_config: + result, cols = self._compute_exogenous_features(result) + feature_columns.extend(cols) + + # Compute stats + null_counts: dict[str, int] = {} + if feature_columns: + null_counts = { + str(k): int(v) for k, v in result[feature_columns].isnull().sum().items() + } + + stats: dict[str, Any] = { + "input_rows": input_rows, + "output_rows": len(result), + "feature_count": len(feature_columns), + "null_counts": null_counts, + } + + logger.info( + "featureops.compute_completed", + config_hash=self.config.config_hash(), + feature_count=len(feature_columns), + output_rows=len(result), + ) + + return FeatureComputationResult( + df=result, + feature_columns=feature_columns, + config_hash=self.config.config_hash(), + stats=stats, + ) + + def _compute_lag_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]: + """Compute lag features with proper grouping. + + CRITICAL: shift(lag) uses PAST data only (positive lag = look back). + Groups by entity columns to prevent cross-series leakage. + + Args: + df: Input dataframe sorted by entity + date. + + Returns: + Tuple of (dataframe with lag features, list of new column names). 
+ """ + config = self.config.lag_config + if config is None: + raise RuntimeError("_compute_lag_features called without lag_config") + + result = df.copy() + columns: list[str] = [] + + for lag in config.lags: + col_name = f"lag_{lag}" + # CRITICAL: Group by entity to prevent cross-series leakage + result[col_name] = df.groupby(self.entity_cols, observed=True)[ + config.target_column + ].shift(lag) # Positive shift = look back in time + if config.fill_value is not None: + result[col_name] = result[col_name].fillna(config.fill_value) + columns.append(col_name) + + return result, columns + + def _compute_rolling_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]: + """Compute rolling window features. + + CRITICAL: shift(1) BEFORE rolling to exclude current observation. + Without shift(1), rolling(7).mean() at row i uses data from [i-6, i]. + With shift(1), it uses data from [i-7, i-1] — truly past data only. + + Args: + df: Input dataframe sorted by entity + date. + + Returns: + Tuple of (dataframe with rolling features, list of new column names). 
+ """ + config = self.config.rolling_config + if config is None: + raise RuntimeError("_compute_rolling_features called without rolling_config") + + result = df.copy() + columns: list[str] = [] + + for window in config.windows: + min_per = config.min_periods if config.min_periods else window + + for agg in config.aggregations: + col_name = f"rolling_{agg}_{window}" + + # CRITICAL: shift(1) prevents using current row in rolling calculation + def compute_rolling( + x: pd.Series[float], + w: int = window, + m: int = min_per, + a: str = agg, + ) -> pd.Series[float]: + return x.shift(1).rolling(window=w, min_periods=m).agg(a) + + result[col_name] = df.groupby(self.entity_cols, observed=True)[ + config.target_column + ].transform(compute_rolling) + columns.append(col_name) + + return result, columns + + def _compute_calendar_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]: + """Compute calendar-based features. + + Calendar features are derived from the date column itself, + so there's no risk of future leakage. + + Args: + df: Input dataframe with date column. + + Returns: + Tuple of (dataframe with calendar features, list of new column names). 
+ """ + config = self.config.calendar_config + if config is None: + raise RuntimeError("_compute_calendar_features called without calendar_config") + + result = df.copy() + columns: list[str] = [] + dates = pd.to_datetime(result[self.date_col]) + + if config.include_day_of_week: + dow = dates.dt.dayofweek # 0=Monday, 6=Sunday + if config.use_cyclical_encoding: + result["dow_sin"] = np.sin(2 * np.pi * dow / 7) + result["dow_cos"] = np.cos(2 * np.pi * dow / 7) + columns.extend(["dow_sin", "dow_cos"]) + else: + result["day_of_week"] = dow + columns.append("day_of_week") + + if config.include_month: + month = dates.dt.month + if config.use_cyclical_encoding: + result["month_sin"] = np.sin(2 * np.pi * month / 12) + result["month_cos"] = np.cos(2 * np.pi * month / 12) + columns.extend(["month_sin", "month_cos"]) + else: + result["month"] = month + columns.append("month") + + if config.include_quarter: + result["quarter"] = dates.dt.quarter + columns.append("quarter") + + if config.include_year: + result["year"] = dates.dt.year + columns.append("year") + + if config.include_is_weekend: + result["is_weekend"] = dates.dt.dayofweek.isin([5, 6]).astype(int) + columns.append("is_weekend") + + if config.include_is_month_end: + result["is_month_end"] = dates.dt.is_month_end.astype(int) + columns.append("is_month_end") + + # is_holiday would require calendar table lookup + # Handled separately if data is joined from Calendar table + + return result, columns + + def _apply_imputation(self, df: pd.DataFrame) -> pd.DataFrame: + """Apply configured imputation strategies. + + CRITICAL: Group-aware imputation to prevent cross-series leakage. + + **Leakage Warnings:** + - "bfill" (backward fill) uses FUTURE values to fill gaps — avoid in production + - "mean" uses entire series mean including FUTURE values — avoid in production + - Use "expanding_mean" for time-safe mean imputation (uses only past data) + - "ffill" (forward fill) and "zero" are safe + + Args: + df: Input dataframe. 
+ + Returns: + Dataframe with imputed values. + """ + config = self.config.imputation_config + if config is None: + raise RuntimeError("_apply_imputation called without imputation_config") + + result = df.copy() + + for col, strategy in config.strategies.items(): + if col not in result.columns: + continue + + if strategy == "zero": + result[col] = result[col].fillna(0) + elif strategy == "ffill": + # CRITICAL: Group-aware forward fill (time-safe) + result[col] = result.groupby(self.entity_cols, observed=True)[col].ffill() + elif strategy == "bfill": + # WARNING: bfill uses future data — use only for debugging/testing + logger.warning( + "featureops.imputation_leakage_risk", + strategy="bfill", + column=col, + message="bfill uses future values to fill gaps; avoid in production", + ) + result[col] = result.groupby(self.entity_cols, observed=True)[col].bfill() + elif strategy == "mean": + # WARNING: mean uses entire series including future — use only for debugging + logger.warning( + "featureops.imputation_leakage_risk", + strategy="mean", + column=col, + message="mean uses entire series including future values; use 'expanding_mean' instead", + ) + result[col] = result.groupby(self.entity_cols, observed=True)[col].transform( + lambda x: x.fillna(x.mean()) + ) + elif strategy == "expanding_mean": + # TIME-SAFE: Uses only past values via expanding window + result[col] = result.groupby(self.entity_cols, observed=True)[col].transform( + lambda x: x.fillna(x.expanding(min_periods=1).mean().shift(1)) + ) + elif strategy == "drop": + result = result.dropna(subset=[col]) + + return result + + def _compute_exogenous_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]: + """Compute exogenous features (price, promo, inventory). + + CRITICAL: All exogenous features are lagged to prevent leakage. + + Args: + df: Input dataframe with exogenous columns. + + Returns: + Tuple of (dataframe with exogenous features, list of new column names). 
+ """ + config = self.config.exogenous_config + if config is None: + raise RuntimeError("_compute_exogenous_features called without exogenous_config") + + result = df.copy() + columns: list[str] = [] + + # Price features (if price column exists) + if config.include_price and "unit_price" in df.columns: + for lag in config.price_lags: + col_name = f"price_lag_{lag}" + result[col_name] = df.groupby(self.entity_cols, observed=True)["unit_price"].shift( + lag + ) + columns.append(col_name) + + if config.include_price_change: + # CRITICAL: shift(1) before pct_change to prevent using current price + # This computes: (price[t-1] - price[t-8]) / price[t-8] + # Without shift(1), it would use current price at t, causing leakage + result["price_pct_change_7d"] = df.groupby(self.entity_cols, observed=True)[ + "unit_price" + ].transform(lambda x: x.shift(1).pct_change(periods=7)) + columns.append("price_pct_change_7d") + + # Stockout flag (if inventory column exists) + if config.include_stockout_flag and "is_stockout" in df.columns: + # Lagged stockout flag (yesterday's stockout) + result["stockout_lag_1"] = df.groupby(self.entity_cols, observed=True)[ + "is_stockout" + ].shift(1) + columns.append("stockout_lag_1") + + return result, columns + + +class FeatureDataLoader: + """Async data loader for feature computation. + + Loads data from database for feature computation. + """ + + async def load_sales_data( + self, + db: AsyncSession, + store_id: int, + product_id: int, + start_date: date_type, + end_date: date_type, + ) -> pd.DataFrame: + """Load sales data for a single series. + + Args: + db: Async database session. + store_id: Store ID. + product_id: Product ID. + start_date: Start date (inclusive). + end_date: End date (inclusive). + + Returns: + DataFrame with sales data. 
+ """ + stmt = ( + select( + SalesDaily.date, + SalesDaily.store_id, + SalesDaily.product_id, + SalesDaily.quantity, + SalesDaily.unit_price, + SalesDaily.total_amount, + ) + .where( + (SalesDaily.store_id == store_id) + & (SalesDaily.product_id == product_id) + & (SalesDaily.date >= start_date) + & (SalesDaily.date <= end_date) + ) + .order_by(SalesDaily.date) + ) + + result = await db.execute(stmt) + rows = result.all() + + if not rows: + return pd.DataFrame( + columns=["date", "store_id", "product_id", "quantity", "unit_price", "total_amount"] + ) + + return pd.DataFrame( + [ + { + "date": row.date, + "store_id": row.store_id, + "product_id": row.product_id, + "quantity": row.quantity, + "unit_price": float(row.unit_price), + "total_amount": float(row.total_amount), + } + for row in rows + ] + ) + + async def load_calendar_data( + self, + db: AsyncSession, + start_date: date_type, + end_date: date_type, + ) -> pd.DataFrame: + """Load calendar data for date range. + + Args: + db: Async database session. + start_date: Start date (inclusive). + end_date: End date (inclusive). + + Returns: + DataFrame with calendar data. 
+ """ + stmt = ( + select( + Calendar.date, + Calendar.day_of_week, + Calendar.month, + Calendar.quarter, + Calendar.year, + Calendar.is_holiday, + Calendar.holiday_name, + ) + .where((Calendar.date >= start_date) & (Calendar.date <= end_date)) + .order_by(Calendar.date) + ) + + result = await db.execute(stmt) + rows = result.all() + + if not rows: + return pd.DataFrame( + columns=[ + "date", + "day_of_week", + "month", + "quarter", + "year", + "is_holiday", + "holiday_name", + ] + ) + + return pd.DataFrame( + [ + { + "date": row.date, + "day_of_week": row.day_of_week, + "month": row.month, + "quarter": row.quarter, + "year": row.year, + "is_holiday": row.is_holiday, + "holiday_name": row.holiday_name, + } + for row in rows + ] + ) + + +async def compute_features_for_series( + db: AsyncSession, + store_id: int, + product_id: int, + cutoff_date: date_type, + lookback_days: int, + config: FeatureSetConfig, +) -> FeatureComputationResult: + """Compute features for a single series. + + Convenience function that loads data and computes features. + + Args: + db: Async database session. + store_id: Store ID. + product_id: Product ID. + cutoff_date: Maximum date to include. + lookback_days: Days of history to use. + config: Feature set configuration. + + Returns: + FeatureComputationResult with computed features. 
+ """ + loader = FeatureDataLoader() + + # Calculate start date + start_date = cutoff_date - timedelta(days=lookback_days) + + # Load sales data + df = await loader.load_sales_data( + db=db, + store_id=store_id, + product_id=product_id, + start_date=start_date, + end_date=cutoff_date, + ) + + # Optionally load and merge calendar data + if config.calendar_config and config.calendar_config.include_is_holiday: + calendar_df = await loader.load_calendar_data( + db=db, + start_date=start_date, + end_date=cutoff_date, + ) + if not calendar_df.empty and not df.empty: + df = df.merge( + calendar_df[["date", "is_holiday"]], + on="date", + how="left", + ) + + # Compute features + service = FeatureEngineeringService(config) + return service.compute_features(df, cutoff_date=cutoff_date) diff --git a/app/features/featuresets/tests/__init__.py b/app/features/featuresets/tests/__init__.py new file mode 100644 index 00000000..e551bfd7 --- /dev/null +++ b/app/features/featuresets/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for featuresets module.""" diff --git a/app/features/featuresets/tests/conftest.py b/app/features/featuresets/tests/conftest.py new file mode 100644 index 00000000..d4cc6561 --- /dev/null +++ b/app/features/featuresets/tests/conftest.py @@ -0,0 +1,171 @@ +"""Test fixtures for featuresets module.""" + +import pandas as pd +import pytest + +from app.features.featuresets.schemas import ( + CalendarConfig, + ExogenousConfig, + FeatureSetConfig, + ImputationConfig, + LagConfig, + RollingConfig, +) + + +@pytest.fixture +def sample_time_series() -> pd.DataFrame: + """Create sample time series data for testing. + + Returns 30 days of data for a single store/product with sequential + quantity values (1, 2, 3, ...) for easy leakage detection. 
+ """ + dates = pd.date_range(start="2024-01-01", periods=30, freq="D") + return pd.DataFrame( + { + "date": dates, + "store_id": [1] * 30, + "product_id": [1] * 30, + "quantity": list(range(1, 31)), # Sequential for leakage detection + "unit_price": [10.0] * 30, + "total_amount": [q * 10.0 for q in range(1, 31)], + } + ) + + +@pytest.fixture +def multi_series_time_series() -> pd.DataFrame: + """Create sample time series with multiple series. + + Returns data for 2 stores x 2 products to test group isolation. + """ + dates = pd.date_range(start="2024-01-01", periods=10, freq="D") + rows = [] + + for store_id in [1, 2]: + for product_id in [1, 2]: + base = (store_id - 1) * 100 + (product_id - 1) * 10 + for i, d in enumerate(dates): + rows.append( + { + "date": d, + "store_id": store_id, + "product_id": product_id, + "quantity": base + i + 1, # Unique per series + "unit_price": 10.0 + store_id, + "total_amount": (base + i + 1) * (10.0 + store_id), + } + ) + + return pd.DataFrame(rows) + + +@pytest.fixture +def sample_lag_config() -> LagConfig: + """Create sample lag configuration.""" + return LagConfig( + schema_version="1.0", + lags=(1, 7, 14), + target_column="quantity", + ) + + +@pytest.fixture +def sample_rolling_config() -> RollingConfig: + """Create sample rolling configuration.""" + return RollingConfig( + schema_version="1.0", + windows=(7, 14), + aggregations=("mean", "std"), + target_column="quantity", + ) + + +@pytest.fixture +def sample_calendar_config() -> CalendarConfig: + """Create sample calendar configuration.""" + return CalendarConfig( + schema_version="1.0", + include_day_of_week=True, + include_month=True, + include_quarter=True, + include_is_weekend=True, + use_cyclical_encoding=True, + ) + + +@pytest.fixture +def sample_exogenous_config() -> ExogenousConfig: + """Create sample exogenous configuration.""" + return ExogenousConfig( + schema_version="1.0", + include_price=True, + price_lags=(7,), + include_price_change=False, + ) + + 
+@pytest.fixture +def sample_imputation_config() -> ImputationConfig: + """Create sample imputation configuration.""" + return ImputationConfig( + schema_version="1.0", + strategies={ + "quantity": "zero", + "unit_price": "ffill", + }, + ) + + +@pytest.fixture +def sample_feature_config( + sample_lag_config: LagConfig, + sample_rolling_config: RollingConfig, + sample_calendar_config: CalendarConfig, +) -> FeatureSetConfig: + """Create sample complete feature configuration.""" + return FeatureSetConfig( + schema_version="1.0", + name="test_config", + description="Test feature configuration", + entity_columns=("store_id", "product_id"), + date_column="date", + target_column="quantity", + lag_config=sample_lag_config, + rolling_config=sample_rolling_config, + calendar_config=sample_calendar_config, + ) + + +@pytest.fixture +def minimal_feature_config() -> FeatureSetConfig: + """Create minimal feature configuration with only lags.""" + return FeatureSetConfig( + schema_version="1.0", + name="minimal_config", + lag_config=LagConfig(lags=(1,)), + ) + + +@pytest.fixture +def time_series_with_gaps() -> pd.DataFrame: + """Create time series with missing dates for imputation testing.""" + # Create dates with gaps (missing day 5, 10, 15) + all_dates = pd.date_range(start="2024-01-01", periods=20, freq="D") + included_dates = [d for i, d in enumerate(all_dates) if (i + 1) not in [5, 10, 15]] + + df = pd.DataFrame( + { + "date": included_dates, + "store_id": [1] * len(included_dates), + "product_id": [1] * len(included_dates), + "quantity": list(range(1, len(included_dates) + 1)), + "unit_price": [10.0] * len(included_dates), + } + ) + + # Add some NaN values + df.loc[3, "quantity"] = None + df.loc[7, "unit_price"] = None + + return df diff --git a/app/features/featuresets/tests/test_leakage.py b/app/features/featuresets/tests/test_leakage.py new file mode 100644 index 00000000..7f32b112 --- /dev/null +++ b/app/features/featuresets/tests/test_leakage.py @@ -0,0 +1,328 @@ 
+"""CRITICAL: Leakage prevention tests for feature engineering. + +These tests verify that feature computation NEVER uses future data. +Sequential values (1, 2, 3...) are used so any leakage is mathematically detectable. +""" + +from datetime import date + +import pandas as pd +import pytest + +from app.features.featuresets.schemas import ( + FeatureSetConfig, + LagConfig, + RollingConfig, +) +from app.features.featuresets.service import FeatureEngineeringService + + +class TestLagLeakage: + """Tests verifying lag features never use future data.""" + + def test_lag_features_no_future_data(self, sample_time_series: pd.DataFrame) -> None: + """CRITICAL: Lag features must only use past data. + + With sequential values (1, 2, 3...), lag_1 at row i should equal i (the value at i-1). + If lag_1 at row i equals i+1 or greater, we have future leakage. + """ + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1,)), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # For each row with a valid lag, verify it uses PAST data only + for i in range(1, len(result.df)): + lag_value = result.df.iloc[i]["lag_1"] + current_quantity = result.df.iloc[i]["quantity"] + + # lag_1 should be the PREVIOUS row's value, which is always < current + assert lag_value < current_quantity, ( + f"LEAKAGE DETECTED at row {i}: lag_1={lag_value} >= current={current_quantity}. " + "Lag feature is using current or future data!" + ) + + # More specifically, lag_1 should exactly equal i (row index 0-based matches quantity-1) + assert lag_value == i, ( + f"LEAKAGE DETECTED at row {i}: lag_1={lag_value} != expected={i}. " + "Lag feature is not correctly shifted." 
+ ) + + def test_lag_7_no_future_leakage(self, sample_time_series: pd.DataFrame) -> None: + """Verify lag_7 uses data from exactly 7 days ago.""" + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(7,)), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # lag_7 at row 7 should be the value from row 0 (which is 1) + # lag_7 at row 14 should be the value from row 7 (which is 8) + for i in range(7, len(result.df)): + lag_value = result.df.iloc[i]["lag_7"] + expected = i - 7 + 1 # quantity at row (i-7) = (i-7) + 1 + + assert lag_value == expected, ( + f"LEAKAGE or ERROR at row {i}: lag_7={lag_value} != expected={expected}" + ) + + # Verify no future data used + current_quantity = result.df.iloc[i]["quantity"] + assert lag_value < current_quantity, ( + f"LEAKAGE DETECTED: lag_7 at row {i} >= current value" + ) + + +class TestRollingLeakage: + """Tests verifying rolling features exclude current observation.""" + + def test_rolling_features_exclude_current(self, sample_time_series: pd.DataFrame) -> None: + """CRITICAL: Rolling features must NOT include current row's value. + + With sequential values, rolling_mean_7 at row i should be the mean of + values from rows (i-7) to (i-1), NOT including row i. + + If current value is included, the mean would be higher than expected. 
+ """ + config = FeatureSetConfig( + name="test", + rolling_config=RollingConfig( + windows=(7,), + aggregations=("mean",), + min_periods=7, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # First 7 rows should be NaN (shift(1) + 7-day window) + for i in range(7): + assert pd.isna(result.df.iloc[i]["rolling_mean_7"]), ( + f"Row {i} should have NaN for rolling_mean_7 but has {result.df.iloc[i]['rolling_mean_7']}" + ) + + # Row 7 (index 7) should have mean of rows 0-6 (values 1-7) + # Mean of [1,2,3,4,5,6,7] = 28/7 = 4.0 + rolling_at_7 = result.df.iloc[7]["rolling_mean_7"] + assert rolling_at_7 == pytest.approx(4.0), ( + f"LEAKAGE DETECTED: rolling_mean_7 at row 7 = {rolling_at_7}, expected 4.0. " + "Current observation may be included in rolling window!" + ) + + # Row 8 should have mean of rows 1-7 (values 2-8) + # Mean of [2,3,4,5,6,7,8] = 35/7 = 5.0 + rolling_at_8 = result.df.iloc[8]["rolling_mean_7"] + assert rolling_at_8 == pytest.approx(5.0), ( + f"LEAKAGE DETECTED: rolling_mean_7 at row 8 = {rolling_at_8}, expected 5.0" + ) + + def test_rolling_max_excludes_current(self, sample_time_series: pd.DataFrame) -> None: + """Rolling max should never equal or exceed current value.""" + config = FeatureSetConfig( + name="test", + rolling_config=RollingConfig( + windows=(7,), + aggregations=("max",), + min_periods=7, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # For sequential data, rolling_max_7 at row i should be quantity[i-1] + # which is always < quantity[i] + for i in range(7, len(result.df)): + rolling_max = result.df.iloc[i]["rolling_max_7"] + current_quantity = result.df.iloc[i]["quantity"] + + # Rolling max of past 7 days (excluding current) should be < current + assert rolling_max < current_quantity, ( + f"LEAKAGE DETECTED at row {i}: rolling_max_7={rolling_max} >= current={current_quantity}. 
" + "Current observation is being included in rolling window!" + ) + + +class TestCutoffLeakage: + """Tests verifying cutoff date is strictly enforced.""" + + def test_cutoff_strictly_enforced(self, sample_time_series: pd.DataFrame) -> None: + """CRITICAL: No data after cutoff should be accessible.""" + cutoff = date(2024, 1, 15) # Only first 15 days + + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1,)), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series, cutoff_date=cutoff) + + # Should only have 15 rows + assert len(result.df) == 15, f"Cutoff violation: expected 15 rows, got {len(result.df)}" + + # Max date should be cutoff + max_date = pd.to_datetime(result.df["date"]).max().date() + assert max_date <= cutoff, f"CUTOFF VIOLATION: max_date={max_date} > cutoff={cutoff}" + + # No quantity values > 15 should exist (they would be from after cutoff) + max_quantity = result.df["quantity"].max() + assert max_quantity <= 15, ( + f"CUTOFF VIOLATION: found quantity={max_quantity} which is from after cutoff" + ) + + def test_features_computed_only_from_pre_cutoff_data( + self, sample_time_series: pd.DataFrame + ) -> None: + """Features at cutoff should only use data from before cutoff.""" + cutoff = date(2024, 1, 15) + + config = FeatureSetConfig( + name="test", + rolling_config=RollingConfig( + windows=(7,), + aggregations=("mean",), + min_periods=7, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series, cutoff_date=cutoff) + + # At the last row (cutoff date), rolling_mean_7 should use rows 8-14 + # Values: 8, 9, 10, 11, 12, 13, 14 (not including 15!) + # Mean = 77/7 = 11.0 + last_row = result.df.iloc[-1] + expected_mean = pytest.approx(11.0) + + assert last_row["rolling_mean_7"] == expected_mean, ( + f"At cutoff, rolling_mean_7={last_row['rolling_mean_7']}, expected {expected_mean}. 
" + "Data from after cutoff may be leaking into features!" + ) + + +class TestGroupIsolationLeakage: + """Tests verifying no cross-series leakage.""" + + def test_group_isolation_no_cross_series_leakage( + self, multi_series_time_series: pd.DataFrame + ) -> None: + """CRITICAL: Features must not leak between different series. + + Each store/product combination should only use its own history. + """ + config = FeatureSetConfig( + name="test", + entity_columns=("store_id", "product_id"), + lag_config=LagConfig(lags=(1,)), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(multi_series_time_series) + + # Check each series independently + for store_id in [1, 2]: + for product_id in [1, 2]: + series_mask = (result.df["store_id"] == store_id) & ( + result.df["product_id"] == product_id + ) + series_df = result.df[series_mask].reset_index(drop=True) + + # Base value for this series + base = (store_id - 1) * 100 + (product_id - 1) * 10 + + # First row of each series should have NaN lag + assert pd.isna(series_df.iloc[0]["lag_1"]), ( + f"Series ({store_id}, {product_id}) first row should have NaN lag_1" + ) + + # Second row should have lag from first row of SAME series only + expected_lag = base + 1 # First value in this series + actual_lag = series_df.iloc[1]["lag_1"] + + assert actual_lag == expected_lag, ( + f"CROSS-SERIES LEAKAGE: Store {store_id}, Product {product_id}: " + f"lag_1={actual_lag}, expected={expected_lag}. " + "Lag is using data from a different series!" 
+ ) + + def test_rolling_group_isolation(self, multi_series_time_series: pd.DataFrame) -> None: + """Rolling features must not mix data from different series.""" + config = FeatureSetConfig( + name="test", + entity_columns=("store_id", "product_id"), + rolling_config=RollingConfig( + windows=(3,), + aggregations=("mean",), + min_periods=3, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(multi_series_time_series) + + # Check series (1, 1) - base=0, values: 1,2,3,4,5,6,7,8,9,10 + # At row 3 (value=4), rolling_mean_3 should be mean of [1,2,3] = 2.0 + series_11 = result.df[ + (result.df["store_id"] == 1) & (result.df["product_id"] == 1) + ].reset_index(drop=True) + + # Row 3 (index 3) has value 4, rolling should use rows 0,1,2 (values 1,2,3) + rolling_at_3 = series_11.iloc[3]["rolling_mean_3"] + assert rolling_at_3 == pytest.approx(2.0), ( + f"Series (1,1) at row 3: rolling_mean_3={rolling_at_3}, expected 2.0. " + "Cross-series contamination may have occurred!" + ) + + # Check series (2, 2) - base=110, values: 111,112,113,114... + series_22 = result.df[ + (result.df["store_id"] == 2) & (result.df["product_id"] == 2) + ].reset_index(drop=True) + + # Row 3 (value=114), rolling should use rows 0,1,2 (values 111,112,113) + # Mean = 336/3 = 112.0 + rolling_22_at_3 = series_22.iloc[3]["rolling_mean_3"] + assert rolling_22_at_3 == pytest.approx(112.0), ( + f"Series (2,2) at row 3: rolling_mean_3={rolling_22_at_3}, expected 112.0. " + "Cross-series contamination detected!" 
+ ) + + +class TestEdgeCaseLeakage: + """Tests for edge cases that might cause subtle leakage.""" + + def test_first_row_never_has_valid_lag(self, sample_time_series: pd.DataFrame) -> None: + """First row of any series must have NaN for lag features (no history).""" + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1, 7, 14)), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + first_row = result.df.iloc[0] + assert pd.isna(first_row["lag_1"]), "First row must have NaN lag_1" + assert pd.isna(first_row["lag_7"]), "First row must have NaN lag_7" + assert pd.isna(first_row["lag_14"]), "First row must have NaN lag_14" + + def test_insufficient_history_has_nan(self, sample_time_series: pd.DataFrame) -> None: + """Rows without sufficient history must have NaN features.""" + config = FeatureSetConfig( + name="test", + rolling_config=RollingConfig( + windows=(14,), + aggregations=("mean",), + min_periods=14, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # First 14 rows should have NaN (shift(1) + 14-day window) + for i in range(14): + assert pd.isna(result.df.iloc[i]["rolling_mean_14"]), ( + f"Row {i} should have NaN rolling_mean_14 due to insufficient history" + ) + + # Row 14 should have valid value + assert not pd.isna(result.df.iloc[14]["rolling_mean_14"]), ( + "Row 14 should have valid rolling_mean_14" + ) diff --git a/app/features/featuresets/tests/test_schemas.py b/app/features/featuresets/tests/test_schemas.py new file mode 100644 index 00000000..9f855d8e --- /dev/null +++ b/app/features/featuresets/tests/test_schemas.py @@ -0,0 +1,261 @@ +"""Unit tests for feature engineering schemas.""" + +from datetime import date + +import pytest +from pydantic import ValidationError + +from app.features.featuresets.schemas import ( + CalendarConfig, + ComputeFeaturesRequest, + ExogenousConfig, + FeatureSetConfig, + 
ImputationConfig, + LagConfig, + RollingConfig, +) + + +class TestLagConfig: + """Tests for LagConfig validation.""" + + def test_valid_lags(self): + """Valid positive lags should be accepted.""" + config = LagConfig(lags=(1, 7, 14, 28)) + assert config.lags == (1, 7, 14, 28) + + def test_rejects_negative_lags(self): + """Negative lags should be rejected (prevents future leakage).""" + with pytest.raises(ValidationError) as exc_info: + LagConfig(lags=(-1, 7)) + + assert "positive integers" in str(exc_info.value).lower() + + def test_rejects_zero_lag(self): + """Zero lag should be rejected (current row is not a lag).""" + with pytest.raises(ValidationError) as exc_info: + LagConfig(lags=(0, 7)) + + assert "positive integers" in str(exc_info.value).lower() + + def test_rejects_empty_lags(self): + """Empty lags tuple should be rejected.""" + with pytest.raises(ValidationError) as exc_info: + LagConfig(lags=()) + + assert "at least one lag" in str(exc_info.value).lower() + + def test_default_values(self): + """Default values should be set correctly.""" + config = LagConfig() + assert config.lags == (1, 7, 14, 28) + assert config.target_column == "quantity" + assert config.fill_value is None + assert config.schema_version == "1.0" + + +class TestRollingConfig: + """Tests for RollingConfig validation.""" + + def test_valid_windows(self): + """Valid positive windows should be accepted.""" + config = RollingConfig(windows=(7, 14, 28)) + assert config.windows == (7, 14, 28) + + def test_rejects_negative_windows(self): + """Negative windows should be rejected.""" + with pytest.raises(ValidationError) as exc_info: + RollingConfig(windows=(-7, 14)) + + assert "positive integers" in str(exc_info.value).lower() + + def test_rejects_empty_windows(self): + """Empty windows tuple should be rejected.""" + with pytest.raises(ValidationError) as exc_info: + RollingConfig(windows=()) + + assert "at least one window" in str(exc_info.value).lower() + + def test_valid_aggregations(self): 
+ """Valid aggregation functions should be accepted.""" + config = RollingConfig(aggregations=("mean", "std", "min", "max", "sum")) + assert len(config.aggregations) == 5 + + +class TestCalendarConfig: + """Tests for CalendarConfig validation.""" + + def test_default_cyclical_encoding(self): + """Cyclical encoding should be enabled by default.""" + config = CalendarConfig() + assert config.use_cyclical_encoding is True + + def test_all_features_can_be_disabled(self): + """All features should be individually disableable.""" + config = CalendarConfig( + include_day_of_week=False, + include_month=False, + include_quarter=False, + include_year=False, + include_is_weekend=False, + include_is_month_end=False, + include_is_holiday=False, + ) + assert config.include_day_of_week is False + + +class TestExogenousConfig: + """Tests for ExogenousConfig validation.""" + + def test_rejects_negative_price_lags(self): + """Negative price lags should be rejected.""" + with pytest.raises(ValidationError) as exc_info: + ExogenousConfig(price_lags=(-7, 14)) + + assert "positive integers" in str(exc_info.value).lower() + + +class TestImputationConfig: + """Tests for ImputationConfig validation.""" + + def test_valid_strategies(self): + """Valid imputation strategies should be accepted.""" + config = ImputationConfig( + strategies={ + "quantity": "zero", + "price": "ffill", + "inventory": "bfill", + "demand": "mean", + "optional": "drop", + } + ) + assert len(config.strategies) == 5 + + def test_rejects_invalid_strategy(self): + """Invalid strategy should be rejected.""" + with pytest.raises(ValidationError): + ImputationConfig(strategies={"quantity": "invalid"}) # type: ignore[dict-item] + + +class TestFeatureSetConfig: + """Tests for FeatureSetConfig.""" + + def test_valid_minimal_config(self): + """Minimal valid config should be accepted.""" + config = FeatureSetConfig(name="test") + assert config.name == "test" + assert config.lag_config is None + assert config.rolling_config is 
None + + def test_valid_full_config(self): + """Full config with all sub-configs should be accepted.""" + config = FeatureSetConfig( + name="full_test", + lag_config=LagConfig(), + rolling_config=RollingConfig(), + calendar_config=CalendarConfig(), + exogenous_config=ExogenousConfig(), + imputation_config=ImputationConfig(), + ) + assert config.lag_config is not None + assert config.rolling_config is not None + + def test_get_enabled_features(self): + """get_enabled_features should return correct list.""" + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(), + calendar_config=CalendarConfig(), + ) + enabled = config.get_enabled_features() + assert "lag" in enabled + assert "calendar" in enabled + assert "rolling" not in enabled + assert "exogenous" not in enabled + + def test_config_hash_deterministic(self): + """config_hash should be deterministic for same config.""" + config1 = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1, 7)), + ) + config2 = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1, 7)), + ) + assert config1.config_hash() == config2.config_hash() + + def test_config_hash_differs_for_different_config(self): + """config_hash should differ for different configs.""" + config1 = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1, 7)), + ) + config2 = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1, 14)), + ) + assert config1.config_hash() != config2.config_hash() + + def test_config_is_frozen(self): + """Config should be immutable (frozen).""" + config = FeatureSetConfig(name="test") + with pytest.raises(ValidationError): + config.name = "modified" + + def test_rejects_empty_name(self): + """Empty name should be rejected.""" + with pytest.raises(ValidationError): + FeatureSetConfig(name="") + + def test_rejects_extra_fields(self): + """Extra fields should be rejected.""" + with pytest.raises(ValidationError): + FeatureSetConfig(name="test", unknown_field="value") # type: 
ignore[call-arg] + + +class TestComputeFeaturesRequest: + """Tests for ComputeFeaturesRequest validation.""" + + def test_valid_request(self): + """Valid request should be accepted.""" + request = ComputeFeaturesRequest( + store_id=1, + product_id=1, + cutoff_date=date(2024, 1, 31), + lookback_days=365, + config=FeatureSetConfig(name="test"), + ) + assert request.store_id == 1 + assert request.cutoff_date == date(2024, 1, 31) + + def test_rejects_zero_store_id(self): + """Zero store_id should be rejected.""" + with pytest.raises(ValidationError): + ComputeFeaturesRequest( + store_id=0, + product_id=1, + cutoff_date=date(2024, 1, 31), + config=FeatureSetConfig(name="test"), + ) + + def test_rejects_lookback_too_large(self): + """Lookback > 1095 should be rejected.""" + with pytest.raises(ValidationError): + ComputeFeaturesRequest( + store_id=1, + product_id=1, + cutoff_date=date(2024, 1, 31), + lookback_days=2000, + config=FeatureSetConfig(name="test"), + ) + + def test_default_lookback(self): + """Default lookback should be 365.""" + request = ComputeFeaturesRequest( + store_id=1, + product_id=1, + cutoff_date=date(2024, 1, 31), + config=FeatureSetConfig(name="test"), + ) + assert request.lookback_days == 365 diff --git a/app/features/featuresets/tests/test_service.py b/app/features/featuresets/tests/test_service.py new file mode 100644 index 00000000..9faddb7e --- /dev/null +++ b/app/features/featuresets/tests/test_service.py @@ -0,0 +1,319 @@ +"""Unit tests for FeatureEngineeringService.""" + +from datetime import date + +import pandas as pd +import pytest + +from app.features.featuresets.schemas import ( + CalendarConfig, + FeatureSetConfig, + ImputationConfig, + LagConfig, + RollingConfig, +) +from app.features.featuresets.service import FeatureEngineeringService + + +class TestLagFeatures: + """Tests for lag feature computation.""" + + def test_lag_1_computation(self, sample_time_series): + """Lag 1 should shift values by 1 position.""" + config = 
FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1,)), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # First row should have NaN for lag_1 + assert pd.isna(result.df.iloc[0]["lag_1"]) + + # Second row should have value from first row + assert result.df.iloc[1]["lag_1"] == 1 # quantity[0] = 1 + + # Third row should have value from second row + assert result.df.iloc[2]["lag_1"] == 2 # quantity[1] = 2 + + def test_lag_7_computation(self, sample_time_series): + """Lag 7 should shift values by 7 positions.""" + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(7,)), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # First 7 rows should have NaN + for i in range(7): + assert pd.isna(result.df.iloc[i]["lag_7"]) + + # Row 8 (index 7) should have value from row 1 (index 0) + assert result.df.iloc[7]["lag_7"] == 1 + + # Row 15 (index 14) should have value from row 8 (index 7) + assert result.df.iloc[14]["lag_7"] == 8 + + def test_multiple_lags(self, sample_time_series): + """Multiple lags should be computed correctly.""" + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1, 7)), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + assert "lag_1" in result.feature_columns + assert "lag_7" in result.feature_columns + assert len(result.feature_columns) == 2 + + def test_lag_fill_value(self, sample_time_series): + """fill_value should replace NaN in lag features.""" + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1,), fill_value=0.0), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # First row should have 0 instead of NaN + assert result.df.iloc[0]["lag_1"] == 0.0 + + +class TestRollingFeatures: + """Tests for rolling feature computation.""" + + def 
test_rolling_mean_7_computation(self, sample_time_series): + """Rolling mean should use shift(1) + rolling window.""" + config = FeatureSetConfig( + name="test", + rolling_config=RollingConfig( + windows=(7,), + aggregations=("mean",), + min_periods=7, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # First 7 rows should have NaN (shift(1) + 7-day window) + for i in range(7): + assert pd.isna(result.df.iloc[i]["rolling_mean_7"]) + + # Row 8 (index 7) should have mean of rows 1-7 (indices 0-6) + # Values: 1, 2, 3, 4, 5, 6, 7 -> mean = 4.0 + assert result.df.iloc[7]["rolling_mean_7"] == pytest.approx(4.0) + + # Row 9 (index 8) should have mean of rows 2-8 (indices 1-7) + # Values: 2, 3, 4, 5, 6, 7, 8 -> mean = 5.0 + assert result.df.iloc[8]["rolling_mean_7"] == pytest.approx(5.0) + + def test_rolling_std_computation(self, sample_time_series): + """Rolling std should be computed correctly.""" + config = FeatureSetConfig( + name="test", + rolling_config=RollingConfig( + windows=(7,), + aggregations=("std",), + min_periods=7, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + # Check that std is computed (non-zero for sequential data) + valid_stds = result.df["rolling_std_7"].dropna() + assert len(valid_stds) > 0 + assert all(std > 0 for std in valid_stds) + + def test_multiple_aggregations(self, sample_time_series): + """Multiple aggregations should be computed.""" + config = FeatureSetConfig( + name="test", + rolling_config=RollingConfig( + windows=(7,), + aggregations=("mean", "std", "min", "max"), + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + assert "rolling_mean_7" in result.feature_columns + assert "rolling_std_7" in result.feature_columns + assert "rolling_min_7" in result.feature_columns + assert "rolling_max_7" in result.feature_columns + + +class 
TestCalendarFeatures: + """Tests for calendar feature computation.""" + + def test_cyclical_day_of_week(self, sample_time_series): + """Day of week should use cyclical encoding.""" + config = FeatureSetConfig( + name="test", + calendar_config=CalendarConfig( + include_day_of_week=True, + use_cyclical_encoding=True, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + assert "dow_sin" in result.feature_columns + assert "dow_cos" in result.feature_columns + + # Values should be in [-1, 1] range + assert result.df["dow_sin"].between(-1, 1).all() + assert result.df["dow_cos"].between(-1, 1).all() + + def test_non_cyclical_day_of_week(self, sample_time_series): + """Non-cyclical day of week should be integer.""" + config = FeatureSetConfig( + name="test", + calendar_config=CalendarConfig( + include_day_of_week=True, + use_cyclical_encoding=False, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + assert "day_of_week" in result.feature_columns + # Day of week should be in [0, 6] range + assert result.df["day_of_week"].between(0, 6).all() + + def test_is_weekend(self, sample_time_series): + """is_weekend should correctly identify weekends.""" + config = FeatureSetConfig( + name="test", + calendar_config=CalendarConfig( + include_is_weekend=True, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + assert "is_weekend" in result.feature_columns + # Values should be 0 or 1 + assert set(result.df["is_weekend"].unique()).issubset({0, 1}) + + def test_quarter(self, sample_time_series): + """Quarter should be computed correctly.""" + config = FeatureSetConfig( + name="test", + calendar_config=CalendarConfig( + include_quarter=True, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series) + + assert "quarter" in result.feature_columns + # 
January data should be Q1 + assert (result.df["quarter"] == 1).all() + + +class TestImputation: + """Tests for imputation strategies.""" + + def test_zero_fill(self, time_series_with_gaps): + """Zero fill should replace NaN with 0.""" + config = FeatureSetConfig( + name="test", + imputation_config=ImputationConfig( + strategies={"quantity": "zero"}, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(time_series_with_gaps) + + # No NaN in quantity column after imputation + assert not result.df["quantity"].isna().any() + + def test_ffill(self, time_series_with_gaps): + """Forward fill should propagate last valid value.""" + config = FeatureSetConfig( + name="test", + imputation_config=ImputationConfig( + strategies={"unit_price": "ffill"}, + ), + ) + service = FeatureEngineeringService(config) + result = service.compute_features(time_series_with_gaps) + + # No NaN in unit_price column after imputation (except possibly first rows) + # Forward fill only fills if there's a previous value + non_null_count = result.df["unit_price"].notna().sum() + assert non_null_count >= len(result.df) - 1 + + +class TestCutoffEnforcement: + """Tests for cutoff date enforcement.""" + + def test_cutoff_filters_data(self, sample_time_series): + """Cutoff should filter out data after cutoff date.""" + cutoff = date(2024, 1, 15) # Only first 15 days + config = FeatureSetConfig(name="test") + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series, cutoff_date=cutoff) + + # Should only have 15 rows + assert len(result.df) == 15 + + # All dates should be <= cutoff + max_date = pd.to_datetime(result.df["date"]).max().date() + assert max_date <= cutoff + + def test_no_cutoff_uses_all_data(self, sample_time_series): + """No cutoff should use all data.""" + config = FeatureSetConfig(name="test") + service = FeatureEngineeringService(config) + result = service.compute_features(sample_time_series, cutoff_date=None) + 
+ assert len(result.df) == 30 + + +class TestComputeFeatures: + """Integration tests for compute_features.""" + + def test_combined_features(self, sample_time_series, sample_feature_config): + """All feature types should be computed together.""" + service = FeatureEngineeringService(sample_feature_config) + result = service.compute_features(sample_time_series) + + # Should have lag, rolling, and calendar features + assert any("lag_" in col for col in result.feature_columns) + assert any("rolling_" in col for col in result.feature_columns) + assert any(col in result.feature_columns for col in ["dow_sin", "dow_cos", "quarter"]) + + def test_config_hash_in_result(self, sample_time_series, sample_feature_config): + """Result should include config hash.""" + service = FeatureEngineeringService(sample_feature_config) + result = service.compute_features(sample_time_series) + + assert result.config_hash == sample_feature_config.config_hash() + + def test_stats_populated(self, sample_time_series, sample_feature_config): + """Stats should be populated in result.""" + service = FeatureEngineeringService(sample_feature_config) + result = service.compute_features(sample_time_series) + + assert "input_rows" in result.stats + assert "output_rows" in result.stats + assert "feature_count" in result.stats + assert "null_counts" in result.stats + assert result.stats["input_rows"] == 30 + + def test_empty_dataframe_handling(self): + """Empty dataframe should be handled gracefully.""" + config = FeatureSetConfig( + name="test", + lag_config=LagConfig(lags=(1,)), + ) + service = FeatureEngineeringService(config) + empty_df = pd.DataFrame(columns=["date", "store_id", "product_id", "quantity"]) + result = service.compute_features(empty_df) + + assert len(result.df) == 0 + assert result.feature_columns == ["lag_1"] diff --git a/app/main.py b/app/main.py index 5b7837c7..584608c9 100644 --- a/app/main.py +++ b/app/main.py @@ -10,6 +10,7 @@ from app.core.health import router as health_router 
from app.core.logging import configure_logging, get_logger from app.core.middleware import RequestIdMiddleware +from app.features.featuresets.routes import router as featuresets_router from app.features.ingest.routes import router as ingest_router logger = get_logger(__name__) @@ -68,6 +69,7 @@ def create_app() -> FastAPI: # Routers app.include_router(health_router) app.include_router(ingest_router) + app.include_router(featuresets_router) return app diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 696e3cf9..1aa6a623 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -198,22 +198,54 @@ app/ --- -## 6) ForecastOps (Training + Backtesting + Registry) +## 6) Feature Engineering (Time-Safe) — ✅ IMPLEMENTED -### 6.1 Model Zoo (Minimum) +### 6.1 Core Service (Implemented via PRP-4) + +The `FeatureEngineeringService` provides time-safe feature computation with CRITICAL leakage prevention: + +**Leakage Prevention Patterns**: +- **Lag features**: `shift(lag)` with positive lag only — ensures only past data accessed +- **Rolling features**: `shift(1)` BEFORE `.rolling()` — excludes current observation from window +- **Group isolation**: `groupby(entity_cols, observed=True)` — prevents cross-series contamination +- **Cutoff enforcement**: Data filtered before feature computation — no future data in pipeline + +### 6.2 Feature Types + +| Type | Description | Output Example | +|------|-------------|----------------| +| Lag | Past values at specified periods | `lag_1`, `lag_7`, `lag_14` | +| Rolling | Rolling statistics (mean, std, min, max, sum) | `rolling_mean_7`, `rolling_std_7` | +| Calendar | Date features with cyclical encoding | `dow_sin`, `dow_cos`, `month_sin` | +| Imputation | Zero-fill for sales, forward-fill for prices | N/A (modifies source columns) | + +### 6.3 API Endpoints + +- `POST /featuresets/compute` — Compute features for a single series +- `POST /featuresets/preview` — Preview features with sample rows + +### 6.4 Location + +- 
Schemas: `app/features/featuresets/schemas.py` +- Service: `app/features/featuresets/service.py` +- Routes: `app/features/featuresets/routes.py` +- Tests: `app/features/featuresets/tests/` (55 tests including leakage prevention) +- Demo: `examples/compute_features_demo.py` + +--- + +## 7) ForecastOps (Training + Backtesting + Registry) + +### 7.1 Model Zoo (Minimum) - naive - seasonal naive - moving average (configurable window) -### 6.2 Feature Engineering (Time-Safe) -- Compute features with an explicit **cutoff**. -- Lags/rolling windows must use history `<= cutoff` only. - -### 6.3 Backtesting Protocol +### 7.2 Backtesting Protocol - Time-based CV only: rolling or expanding splits (no random split). - Metrics: MAE, sMAPE (pinball loss later if needed). -### 6.4 Model Registry +### 7.3 Model Registry Each run stores: - run_id, timestamps - model_type + model_config (JSON) @@ -225,7 +257,7 @@ Each run stores: --- -## 7) Typed FastAPI Contracts (Serving Layer) +## 8) Typed FastAPI Contracts (Serving Layer) Minimum endpoint categories (planned): - `POST /ingest/sales-daily` (optional `/ingest/transactions`) @@ -239,7 +271,7 @@ Contracts are Pydantic v2 validated and use `response_model` for explicit output --- -## 8) Dashboard (React + Vite) +## 9) Dashboard (React + Vite) The UI is intentionally **table-first**: - Data Explorer @@ -252,22 +284,22 @@ Decision reference: `docs/ADR/ADR-0002-frontend-architecture-vite-spa-first.md` --- -## 9) RAG Knowledge Base (Postgres + pgvector) +## 10) RAG Knowledge Base (Postgres + pgvector) -### 9.1 Indexed Sources (Planned) +### 10.1 Indexed Sources (Planned) - `README.md` - `docs/*` (Architecture, ADRs, guides) - OpenAPI export - Run reports generated per training run -### 9.2 Evidence-Grounded Answers +### 10.2 Evidence-Grounded Answers RAG must return citations for non-trivial claims; if evidence is insufficient, it must respond “not found / insufficient evidence”. 
Decision reference: `docs/ADR/ADR-0003-vector-storage-pgvector-in-postgres.md` --- -## 10) Quality, CI, and Review Rules +## 11) Quality, CI, and Review Rules The repo standards live in `docs/validation/` and are treated as merge gates: - Ruff lint/format @@ -278,7 +310,7 @@ The repo standards live in `docs/validation/` and are treated as merge gates: --- -## 11) Roadmap (Phased Delivery) +## 12) Roadmap (Phased Delivery) - **Phase-0**: vertical-slice demo (seed → ingest → baseline train → predict → UI tables) - **Phase-1**: ForecastOps core (backtesting + registry + leaderboard) diff --git a/docs/PHASE-index.md b/docs/PHASE-index.md index e1bf3bcc..589b763b 100644 --- a/docs/PHASE-index.md +++ b/docs/PHASE-index.md @@ -11,7 +11,7 @@ This document indexes all implementation phases of the ForecastLabAI project. | 0 | Project Foundation | Completed | PRP-0, PRP-1 | [0-INIT_PHASE.md](./PHASE/0-INIT_PHASE.md) | | 1 | Data Platform | Completed | PRP-2 | [1-DATA_PLATFORM.md](./PHASE/1-DATA_PLATFORM.md) | | 2 | Ingest Layer | Completed | PRP-3 | [2-INGEST_LAYER.md](./PHASE/2-INGEST_LAYER.md) | -| 3 | Feature Engineering | Pending | PRP-4 | - | +| 3 | Feature Engineering | Completed | PRP-4 | [3-FEATURE_ENGINEERING.md](./PHASE/3-FEATURE_ENGINEERING.md) | | 4 | Forecasting | Pending | PRP-5 | - | | 5 | Backtesting | Pending | PRP-6 | - | | 6 | Model Registry | Pending | PRP-7 | - | @@ -125,13 +125,41 @@ This document indexes all implementation phases of the ForecastLabAI project. 
} ``` +### [Phase 3: Feature Engineering](./PHASE/3-FEATURE_ENGINEERING.md) + +**Date Completed**: 2026-01-31 + +**Summary**: Time-safe feature engineering with CRITICAL leakage prevention: +- FeatureEngineeringService with lag, rolling, calendar, and exogenous features +- CRITICAL: Lag features use positive shift() only (no future data access) +- CRITICAL: Rolling features use shift(1) BEFORE rolling to exclude current observation +- CRITICAL: Group-aware operations prevent cross-series leakage +- FastAPI endpoints: POST /featuresets/compute, POST /featuresets/preview +- 55 unit tests including comprehensive leakage prevention tests + +**Key Deliverables**: +- `app/features/featuresets/schemas.py` - Pydantic schemas for feature configuration +- `app/features/featuresets/service.py` - FeatureEngineeringService +- `app/features/featuresets/routes.py` - API endpoints +- `app/features/featuresets/tests/` - 55 tests (schemas, service, leakage prevention) +- `examples/compute_features_demo.py` - Demo script + +**Feature Types**: +- **Lag features**: Past values at specified lag periods with optional fill_value +- **Rolling features**: Mean, std, min, max, sum over configurable windows +- **Calendar features**: Day of week, month, quarter with cyclical encoding (sin/cos) +- **Imputation**: Zero-fill for sales, forward-fill for prices + +**Validation Results**: +- Ruff: All checks passed +- MyPy: 0 errors +- Pyright: 0 errors +- Pytest: 55 tests passed + --- ## Pending Phases -### Phase 3: Feature Engineering -Time-safe feature computation with lag, rolling, and exogenous features. - ### Phase 4: Forecasting Model zoo with unified interface for naive, seasonal, and ML models. 
@@ -190,3 +218,4 @@ Each phase document (`docs/PHASE/X-PHASE_NAME.md`) contains: | 2026-01-26 | 0 | Added CI/CD infrastructure (5 GitHub Actions workflows) | | 2026-01-26 | 1 | Data Platform schema and migrations completed (v0.1.3) | | 2026-01-26 | 2 | Ingest Layer with POST /ingest/sales-daily endpoint completed | +| 2026-01-31 | 3 | Feature Engineering with time-safe leakage prevention completed | diff --git a/docs/PHASE/3-FEATURE_ENGINEERING.md b/docs/PHASE/3-FEATURE_ENGINEERING.md new file mode 100644 index 00000000..bfd1ab10 --- /dev/null +++ b/docs/PHASE/3-FEATURE_ENGINEERING.md @@ -0,0 +1,283 @@ +# Phase 3: Feature Engineering + +**Date Completed**: 2026-01-31 +**PRP**: [PRP-4-feature-engineering.md](../../PRPs/PRP-4-feature-engineering.md) +**Release**: PR #24 + +--- + +## Executive Summary + +Phase 3 implements the Feature Engineering Layer for ForecastLabAI with CRITICAL leakage prevention patterns. The module provides time-safe feature computation for retail demand forecasting, ensuring that features computed at any cutoff date only use data available at that point in time. + +**Key Achievement**: Zero future data leakage through architectural constraints enforced at the service layer. + +--- + +## Deliverables + +### 1. 
Feature Configuration Schemas + +**File**: `app/features/featuresets/schemas.py` + +Pydantic v2 schemas with frozen configs for reproducibility: + +| Schema | Purpose | +|--------|---------| +| `LagConfig` | Lag feature configuration (lags, target_column, fill_value) | +| `RollingConfig` | Rolling window configuration (windows, aggregations, min_periods) | +| `CalendarConfig` | Calendar feature configuration (day_of_week, month, cyclical encoding) | +| `ExogenousConfig` | Exogenous feature configuration (price_lags, stockout flags) | +| `ImputationConfig` | Imputation strategies (zero, ffill, bfill, mean, expanding_mean, drop) | +| `FeatureSetConfig` | Complete feature configuration with config_hash() | +| `ComputeFeaturesRequest` | API request schema with validation | +| `ComputeFeaturesResponse` | API response with features and metadata | + +**Key Features**: +- Frozen models (`frozen=True`) for immutability +- Schema versioning for registry storage +- Deterministic `config_hash()` for deduplication +- Validation: positive lags, non-empty configs, valid aggregations + +### 2. FeatureEngineeringService + +**File**: `app/features/featuresets/service.py` + +Core service with CRITICAL leakage prevention: + +```python +class FeatureEngineeringService: + """Time-safe feature engineering service. + + CRITICAL: All feature computation respects cutoff_date to prevent leakage. + """ + + def compute_features(self, df, cutoff_date=None) -> FeatureComputationResult: + # 1. Filter to cutoff BEFORE any computation + # 2. Sort by entity + date + # 3. 
Compute features with group isolation +``` + +**Leakage Prevention Patterns**: + +| Pattern | Implementation | Why | +|---------|---------------|-----| +| Lag features | `shift(lag)` with positive lag only | Ensures only past data accessed | +| Rolling features | `shift(1)` BEFORE `.rolling()` | Excludes current observation from window | +| Group isolation | `groupby(entity_cols, observed=True)` | Prevents cross-series contamination | +| Cutoff enforcement | Filter data before feature computation | No future data in pipeline | + +### 3. API Endpoints + +**File**: `app/features/featuresets/routes.py` + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/featuresets/compute` | POST | Compute features for a single series | +| `/featuresets/preview` | POST | Preview features with limited sample rows | + +**Response Schema**: +```json +{ + "rows": [ + { + "date": "2024-01-31", + "store_id": 1, + "product_id": 1, + "features": { + "lag_1": 10.0, + "lag_7": 8.0, + "rolling_mean_7": 9.5, + "dow_sin": 0.433, + "dow_cos": 0.901 + } + } + ], + "feature_columns": ["lag_1", "lag_7", "rolling_mean_7", "dow_sin", "dow_cos"], + "config_hash": "a1b2c3d4e5f6g7h8", + "cutoff_date": "2024-01-31", + "row_count": 365, + "null_counts": {"lag_1": 1, "lag_7": 7}, + "duration_ms": 45.23 +} +``` + +### 4. Test Suite + +**Directory**: `app/features/featuresets/tests/` + +| File | Tests | Coverage | +|------|-------|----------| +| `test_schemas.py` | 16 | Schema validation, config hash, frozen models | +| `test_service.py` | 19 | Lag, rolling, calendar, imputation, cutoff | +| `test_leakage.py` | 10 | CRITICAL leakage prevention tests | +| `conftest.py` | - | Test fixtures with sequential values for leakage detection | + +**Total**: 55 tests + +**Leakage Test Strategy**: +- Use sequential values (1, 2, 3...) so leakage is mathematically detectable +- Assert feature at row i never uses data from rows > i +- Test group isolation with multi-series fixtures + +### 5. 
Example Script + +**File**: `examples/compute_features_demo.py` + +Runnable demo showing: +- Feature configuration +- API calls to /compute and /preview +- Response handling + +--- + +## Configuration + +**File**: `app/core/config.py` + +New settings added: + +```python +# Feature Engineering +feature_max_lookback_days: int = 1095 # 3 years +feature_max_lag: int = 365 +feature_max_window: int = 90 +``` + +--- + +## Feature Types + +### Lag Features + +Past values at specified lag periods. + +```python +LagConfig( + lags=(1, 7, 14, 28), # Days to look back + target_column="quantity", + fill_value=None, # Optional: fill NaN with this value +) +``` + +**Output columns**: `lag_1`, `lag_7`, `lag_14`, `lag_28` + +### Rolling Features + +Rolling statistics over configurable windows. + +```python +RollingConfig( + windows=(7, 14, 28), + aggregations=("mean", "std", "min", "max"), + min_periods=7, +) +``` + +**Output columns**: `rolling_mean_7`, `rolling_std_7`, `rolling_min_7`, ... + +**CRITICAL**: Uses `shift(1)` before rolling to exclude current observation. + +### Calendar Features + +Date-derived features with optional cyclical encoding. + +```python +CalendarConfig( + include_day_of_week=True, + include_month=True, + include_quarter=True, + include_is_weekend=True, + use_cyclical_encoding=True, # sin/cos encoding +) +``` + +**Output columns**: +- Cyclical: `dow_sin`, `dow_cos`, `month_sin`, `month_cos` +- Non-cyclical: `day_of_week`, `month`, `quarter`, `is_weekend` + +### Imputation + +```python +ImputationConfig( + strategies={ + "quantity": "zero", # Zero-fill for sales + "unit_price": "ffill", # Forward-fill for prices + } +) +``` + +**Strategies**: `zero`, `ffill`, `bfill`, `mean`, `expanding_mean`, `drop` (`bfill` and `mean` can leak future data and emit a warning; `expanding_mean` is the time-safe alternative) + +--- + +## Dependencies + +Added to `pyproject.toml`: + +```toml +dependencies = [ + # ... existing + "pandas>=3.0.0", + "numpy>=2.4.1", +] + +[dependency-groups] +dev = [ + # ... 
existing + "pandas-stubs>=2.3.3.260113", +] +``` + +--- + +## Directory Structure + +``` +app/features/featuresets/ +├── __init__.py # Module exports +├── schemas.py # Pydantic configuration schemas +├── service.py # FeatureEngineeringService +├── routes.py # FastAPI endpoints +└── tests/ + ├── __init__.py + ├── conftest.py # Test fixtures + ├── test_schemas.py # Schema validation tests + ├── test_service.py # Service unit tests + └── test_leakage.py # CRITICAL leakage tests + +examples/ +└── compute_features_demo.py # Demo script +``` + +--- + +## Validation Results + +``` +$ uv run ruff check app/features/featuresets/ +All checks passed! + +$ uv run mypy app/features/featuresets/ +Success: no issues found in 9 source files + +$ uv run pyright app/features/featuresets/ +0 errors, 0 warnings, 0 informations + +$ uv run pytest app/features/featuresets/tests/ -v +55 passed in 0.45s +``` + +--- + +## Next Phase Preparation + +Phase 4 (Forecasting) will use the feature engineering module to: +1. Generate features for training data using cutoff-based computation +2. Store feature configuration in model registry +3. Ensure reproducible feature computation during prediction + +**Integration Points**: +- `FeatureSetConfig.config_hash()` for registry storage +- `compute_features_for_series()` for model training pipelines +- Schema versioning for backward compatibility diff --git a/examples/compute_features_demo.py b/examples/compute_features_demo.py new file mode 100644 index 00000000..30fcab38 --- /dev/null +++ b/examples/compute_features_demo.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python +"""Demo script for feature engineering computation. + +Usage: + uv run python examples/compute_features_demo.py + +This script demonstrates how to: +1. Configure feature engineering with various feature types +2. Compute time-safe features via the API +3. 
Preview features for debugging + +Requirements: + - API server running: uv run uvicorn app.main:app --port 8123 + - Database seeded with sales data +""" + +import json +from datetime import date + +import httpx + +# API configuration +API_BASE = "http://localhost:8123" +FEATURES_ENDPOINT = f"{API_BASE}/featuresets" + + +def create_sample_config() -> dict: + """Create a sample feature configuration. + + Returns: + FeatureSetConfig as a dictionary. + """ + return { + "name": "retail_forecast_v1", + "description": "Standard retail forecasting features", + "entity_columns": ["store_id", "product_id"], + "date_column": "date", + "target_column": "quantity", + "lag_config": { + "lags": [1, 7, 14, 28], + "target_column": "quantity", + "fill_value": None, + }, + "rolling_config": { + "windows": [7, 14, 28], + "aggregations": ["mean", "std", "min", "max"], + "target_column": "quantity", + "min_periods": 7, + }, + "calendar_config": { + "include_day_of_week": True, + "include_month": True, + "include_quarter": True, + "include_year": False, + "include_is_weekend": True, + "include_is_month_end": False, + "include_is_holiday": False, + "use_cyclical_encoding": True, + }, + "imputation_config": { + "strategies": { + "quantity": "zero", + "unit_price": "ffill", + } + }, + } + + +def compute_features( + store_id: int, + product_id: int, + cutoff_date: date, + lookback_days: int = 365, +) -> dict: + """Compute features for a single series. + + Args: + store_id: Store identifier. + product_id: Product identifier. + cutoff_date: Date up to which features are computed. + lookback_days: Number of days of history to use. + + Returns: + API response with computed features. 
+ """ + request_body = { + "store_id": store_id, + "product_id": product_id, + "cutoff_date": cutoff_date.isoformat(), + "lookback_days": lookback_days, + "config": create_sample_config(), + } + + with httpx.Client(timeout=30.0) as client: + response = client.post( + f"{FEATURES_ENDPOINT}/compute", + json=request_body, + ) + response.raise_for_status() + return response.json() + + +def preview_features( + store_id: int, + product_id: int, + cutoff_date: date, + sample_rows: int = 10, +) -> dict: + """Preview features for debugging. + + Args: + store_id: Store identifier. + product_id: Product identifier. + cutoff_date: Date up to which features are computed. + sample_rows: Number of sample rows to return. + + Returns: + API response with sample feature rows. + """ + request_body = { + "store_id": store_id, + "product_id": product_id, + "cutoff_date": cutoff_date.isoformat(), + "sample_rows": sample_rows, + "config": create_sample_config(), + } + + with httpx.Client(timeout=30.0) as client: + response = client.post( + f"{FEATURES_ENDPOINT}/preview", + json=request_body, + ) + response.raise_for_status() + return response.json() + + +def main() -> None: + """Run the feature engineering demo.""" + print("ForecastLabAI - Feature Engineering Demo") + print("=" * 50) + print() + + # Demo parameters + store_id = 1 + product_id = 1 + cutoff_date = date(2024, 1, 31) + + print(f"Store ID: {store_id}") + print(f"Product ID: {product_id}") + print(f"Cutoff Date: {cutoff_date}") + print() + + # Show configuration + print("Feature Configuration:") + print("-" * 30) + config = create_sample_config() + print(f" Name: {config['name']}") + print(f" Lag features: {config['lag_config']['lags']}") + print(f" Rolling windows: {config['rolling_config']['windows']}") + print(f" Rolling aggregations: {config['rolling_config']['aggregations']}") + print(f" Cyclical encoding: {config['calendar_config']['use_cyclical_encoding']}") + print() + + try: + # Preview features + print("Previewing 
features (10 sample rows)...") + print("-" * 30) + + result = preview_features( + store_id=store_id, + product_id=product_id, + cutoff_date=cutoff_date, + sample_rows=10, + ) + + print(f" Config hash: {result['config_hash']}") + print(f" Row count: {result['row_count']}") + print(f" Feature columns: {len(result['feature_columns'])}") + print(f" Duration: {result['duration_ms']:.2f}ms") + print() + + print("Feature columns:") + for col in result["feature_columns"]: + print(f" - {col}") + print() + + print("Sample rows (last 3):") + for row in result["rows"][-3:]: + print(f" Date: {row['date']}") + print(f" Features: {json.dumps(row['features'], indent=6)}") + print() + + # Compute full features + print("Computing full features...") + print("-" * 30) + + full_result = compute_features( + store_id=store_id, + product_id=product_id, + cutoff_date=cutoff_date, + lookback_days=365, + ) + + print(f" Total rows: {full_result['row_count']}") + print(f" Duration: {full_result['duration_ms']:.2f}ms") + print() + + # Show null counts + if full_result["null_counts"]: + print("Null counts per feature:") + for col, count in sorted(full_result["null_counts"].items()): + if count > 0: + print(f" {col}: {count}") + print() + + print("Demo completed successfully!") + + except httpx.ConnectError: + print("ERROR: Cannot connect to API server.") + print("Please start the server with:") + print(" uv run uvicorn app.main:app --port 8123") + + except httpx.HTTPStatusError as e: + print(f"ERROR: API returned status {e.response.status_code}") + print(f"Response: {e.response.text}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 19584793..0c939478 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,8 @@ dependencies = [ "structlog>=24.4.0", "alembic>=1.14.0", "python-dotenv>=1.0.1", + "pandas>=3.0.0", + "numpy>=2.4.1", ] [project.optional-dependencies] @@ -158,3 +160,8 @@ markers = [ filterwarnings = [ "ignore::DeprecationWarning", ] + 
+[dependency-groups] +dev = [ + "pandas-stubs>=2.3.3.260113", +] diff --git a/uv.lock b/uv.lock index 8e2941b8..a8937ab1 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,14 @@ version = 1 revision = 3 requires-python = ">=3.12" +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", +] [[package]] name = "alembic" @@ -208,12 +216,14 @@ wheels = [ [[package]] name = "forecastlabai" -version = "0.1.2" +version = "0.1.5" source = { editable = "." } dependencies = [ { name = "alembic" }, { name = "asyncpg" }, { name = "fastapi" }, + { name = "numpy" }, + { name = "pandas" }, { name = "pydantic" }, { name = "pydantic-settings" }, { name = "python-dotenv" }, @@ -233,6 +243,11 @@ dev = [ { name = "ruff" }, ] +[package.dev-dependencies] +dev = [ + { name = "pandas-stubs" }, +] + [package.metadata] requires-dist = [ { name = "alembic", specifier = ">=1.14.0" }, @@ -240,6 +255,8 @@ requires-dist = [ { name = "fastapi", specifier = ">=0.115.0" }, { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.13.0" }, + { name = "numpy", specifier = ">=2.4.1" }, + { name = "pandas", specifier = ">=3.0.0" }, { name = "pydantic", specifier = ">=2.10.0" }, { name = "pydantic-settings", specifier = ">=2.6.0" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.390" }, @@ -254,6 +271,9 @@ requires-dist = [ ] provides-extras = ["dev"] +[package.metadata.requires-dev] +dev = [{ name = "pandas-stubs", specifier = ">=2.3.3.260113" }] + [[package]] name = "greenlet" version = "3.3.1" @@ -559,6 
+579,67 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" }, ] +[[package]] +name = "numpy" +version = "2.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/62/ae72ff66c0f1fd959925b4c11f8c2dea61f47f6acaea75a08512cdfe3fed/numpy-2.4.1.tar.gz", hash = "sha256:a1ceafc5042451a858231588a104093474c6a5c57dcc724841f5c888d237d690", size = 20721320, upload-time = "2026-01-10T06:44:59.619Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/7f/ec53e32bf10c813604edf07a3682616bd931d026fcde7b6d13195dfb684a/numpy-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d3703409aac693fa82c0aee023a1ae06a6e9d065dba10f5e8e80f642f1e9d0a2", size = 16656888, upload-time = "2026-01-10T06:42:40.913Z" }, + { url = "https://files.pythonhosted.org/packages/b8/e0/1f9585d7dae8f14864e948fd7fa86c6cb72dee2676ca2748e63b1c5acfe0/numpy-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7211b95ca365519d3596a1d8688a95874cc94219d417504d9ecb2df99fa7bfa8", size = 12373956, upload-time = "2026-01-10T06:42:43.091Z" }, + { url = "https://files.pythonhosted.org/packages/8e/43/9762e88909ff2326f5e7536fa8cb3c49fb03a7d92705f23e6e7f553d9cb3/numpy-2.4.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5adf01965456a664fc727ed69cc71848f28d063217c63e1a0e200a118d5eec9a", size = 5202567, upload-time = "2026-01-10T06:42:45.107Z" }, + { url = "https://files.pythonhosted.org/packages/4b/ee/34b7930eb61e79feb4478800a4b95b46566969d837546aa7c034c742ef98/numpy-2.4.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:26f0bcd9c79a00e339565b303badc74d3ea2bd6d52191eeca5f95936cad107d0", size = 6549459, upload-time = "2026-01-10T06:42:48.152Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/e3/5f115fae982565771be994867c89bcd8d7208dbfe9469185497d70de5ddf/numpy-2.4.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0093e85df2960d7e4049664b26afc58b03236e967fb942354deef3208857a04c", size = 14404859, upload-time = "2026-01-10T06:42:49.947Z" }, + { url = "https://files.pythonhosted.org/packages/d9/7d/9c8a781c88933725445a859cac5d01b5871588a15969ee6aeb618ba99eee/numpy-2.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ad270f438cbdd402c364980317fb6b117d9ec5e226fff5b4148dd9aa9fc6e02", size = 16371419, upload-time = "2026-01-10T06:42:52.409Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d2/8aa084818554543f17cf4162c42f162acbd3bb42688aefdba6628a859f77/numpy-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:297c72b1b98100c2e8f873d5d35fb551fce7040ade83d67dd51d38c8d42a2162", size = 16182131, upload-time = "2026-01-10T06:42:54.694Z" }, + { url = "https://files.pythonhosted.org/packages/60/db/0425216684297c58a8df35f3284ef56ec4a043e6d283f8a59c53562caf1b/numpy-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cf6470d91d34bf669f61d515499859fa7a4c2f7c36434afb70e82df7217933f9", size = 18295342, upload-time = "2026-01-10T06:42:56.991Z" }, + { url = "https://files.pythonhosted.org/packages/31/4c/14cb9d86240bd8c386c881bafbe43f001284b7cce3bc01623ac9475da163/numpy-2.4.1-cp312-cp312-win32.whl", hash = "sha256:b6bcf39112e956594b3331316d90c90c90fb961e39696bda97b89462f5f3943f", size = 5959015, upload-time = "2026-01-10T06:42:59.631Z" }, + { url = "https://files.pythonhosted.org/packages/51/cf/52a703dbeb0c65807540d29699fef5fda073434ff61846a564d5c296420f/numpy-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:e1a27bb1b2dee45a2a53f5ca6ff2d1a7f135287883a1689e930d44d1ff296c87", size = 12310730, upload-time = "2026-01-10T06:43:01.627Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/80/a828b2d0ade5e74a9fe0f4e0a17c30fdc26232ad2bc8c9f8b3197cf7cf18/numpy-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:0e6e8f9d9ecf95399982019c01223dc130542960a12edfa8edd1122dfa66a8a8", size = 10312166, upload-time = "2026-01-10T06:43:03.673Z" }, + { url = "https://files.pythonhosted.org/packages/04/68/732d4b7811c00775f3bd522a21e8dd5a23f77eb11acdeb663e4a4ebf0ef4/numpy-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d797454e37570cfd61143b73b8debd623c3c0952959adb817dd310a483d58a1b", size = 16652495, upload-time = "2026-01-10T06:43:06.283Z" }, + { url = "https://files.pythonhosted.org/packages/20/ca/857722353421a27f1465652b2c66813eeeccea9d76d5f7b74b99f298e60e/numpy-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82c55962006156aeef1629b953fd359064aa47e4d82cfc8e67f0918f7da3344f", size = 12368657, upload-time = "2026-01-10T06:43:09.094Z" }, + { url = "https://files.pythonhosted.org/packages/81/0d/2377c917513449cc6240031a79d30eb9a163d32a91e79e0da47c43f2c0c8/numpy-2.4.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:71abbea030f2cfc3092a0ff9f8c8fdefdc5e0bf7d9d9c99663538bb0ecdac0b9", size = 5197256, upload-time = "2026-01-10T06:43:13.634Z" }, + { url = "https://files.pythonhosted.org/packages/17/39/569452228de3f5de9064ac75137082c6214be1f5c532016549a7923ab4b5/numpy-2.4.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5b55aa56165b17aaf15520beb9cbd33c9039810e0d9643dd4379e44294c7303e", size = 6545212, upload-time = "2026-01-10T06:43:15.661Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a4/77333f4d1e4dac4395385482557aeecf4826e6ff517e32ca48e1dafbe42a/numpy-2.4.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0faba4a331195bfa96f93dd9dfaa10b2c7aa8cda3a02b7fd635e588fe821bf5", size = 14402871, upload-time = "2026-01-10T06:43:17.324Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/87/d341e519956273b39d8d47969dd1eaa1af740615394fe67d06f1efa68773/numpy-2.4.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3e3087f53e2b4428766b54932644d148613c5a595150533ae7f00dab2f319a8", size = 16359305, upload-time = "2026-01-10T06:43:19.376Z" }, + { url = "https://files.pythonhosted.org/packages/32/91/789132c6666288eaa20ae8066bb99eba1939362e8f1a534949a215246e97/numpy-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:49e792ec351315e16da54b543db06ca8a86985ab682602d90c60ef4ff4db2a9c", size = 16181909, upload-time = "2026-01-10T06:43:21.808Z" }, + { url = "https://files.pythonhosted.org/packages/cf/b8/090b8bd27b82a844bb22ff8fdf7935cb1980b48d6e439ae116f53cdc2143/numpy-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:79e9e06c4c2379db47f3f6fc7a8652e7498251789bf8ff5bd43bf478ef314ca2", size = 18284380, upload-time = "2026-01-10T06:43:23.957Z" }, + { url = "https://files.pythonhosted.org/packages/67/78/722b62bd31842ff029412271556a1a27a98f45359dea78b1548a3a9996aa/numpy-2.4.1-cp313-cp313-win32.whl", hash = "sha256:3d1a100e48cb266090a031397863ff8a30050ceefd798f686ff92c67a486753d", size = 5957089, upload-time = "2026-01-10T06:43:27.535Z" }, + { url = "https://files.pythonhosted.org/packages/da/a6/cf32198b0b6e18d4fbfa9a21a992a7fca535b9bb2b0cdd217d4a3445b5ca/numpy-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:92a0e65272fd60bfa0d9278e0484c2f52fe03b97aedc02b357f33fe752c52ffb", size = 12307230, upload-time = "2026-01-10T06:43:29.298Z" }, + { url = "https://files.pythonhosted.org/packages/44/6c/534d692bfb7d0afe30611320c5fb713659dcb5104d7cc182aff2aea092f5/numpy-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:20d4649c773f66cc2fc36f663e091f57c3b7655f936a4c681b4250855d1da8f5", size = 10313125, upload-time = "2026-01-10T06:43:31.782Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/a1/354583ac5c4caa566de6ddfbc42744409b515039e085fab6e0ff942e0df5/numpy-2.4.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f93bc6892fe7b0663e5ffa83b61aab510aacffd58c16e012bb9352d489d90cb7", size = 12496156, upload-time = "2026-01-10T06:43:34.237Z" }, + { url = "https://files.pythonhosted.org/packages/51/b0/42807c6e8cce58c00127b1dc24d365305189991f2a7917aa694a109c8d7d/numpy-2.4.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:178de8f87948163d98a4c9ab5bee4ce6519ca918926ec8df195af582de28544d", size = 5324663, upload-time = "2026-01-10T06:43:36.211Z" }, + { url = "https://files.pythonhosted.org/packages/fe/55/7a621694010d92375ed82f312b2f28017694ed784775269115323e37f5e2/numpy-2.4.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:98b35775e03ab7f868908b524fc0a84d38932d8daf7b7e1c3c3a1b6c7a2c9f15", size = 6645224, upload-time = "2026-01-10T06:43:37.884Z" }, + { url = "https://files.pythonhosted.org/packages/50/96/9fa8635ed9d7c847d87e30c834f7109fac5e88549d79ef3324ab5c20919f/numpy-2.4.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:941c2a93313d030f219f3a71fd3d91a728b82979a5e8034eb2e60d394a2b83f9", size = 14462352, upload-time = "2026-01-10T06:43:39.479Z" }, + { url = "https://files.pythonhosted.org/packages/03/d1/8cf62d8bb2062da4fb82dd5d49e47c923f9c0738032f054e0a75342faba7/numpy-2.4.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:529050522e983e00a6c1c6b67411083630de8b57f65e853d7b03d9281b8694d2", size = 16407279, upload-time = "2026-01-10T06:43:41.93Z" }, + { url = "https://files.pythonhosted.org/packages/86/1c/95c86e17c6b0b31ce6ef219da00f71113b220bcb14938c8d9a05cee0ff53/numpy-2.4.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:2302dc0224c1cbc49bb94f7064f3f923a971bfae45c33870dcbff63a2a550505", size = 16248316, upload-time = "2026-01-10T06:43:44.121Z" }, + { url = 
"https://files.pythonhosted.org/packages/30/b4/e7f5ff8697274c9d0fa82398b6a372a27e5cef069b37df6355ccb1f1db1a/numpy-2.4.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9171a42fcad32dcf3fa86f0a4faa5e9f8facefdb276f54b8b390d90447cff4e2", size = 18329884, upload-time = "2026-01-10T06:43:46.613Z" }, + { url = "https://files.pythonhosted.org/packages/37/a4/b073f3e9d77f9aec8debe8ca7f9f6a09e888ad1ba7488f0c3b36a94c03ac/numpy-2.4.1-cp313-cp313t-win32.whl", hash = "sha256:382ad67d99ef49024f11d1ce5dcb5ad8432446e4246a4b014418ba3a1175a1f4", size = 6081138, upload-time = "2026-01-10T06:43:48.854Z" }, + { url = "https://files.pythonhosted.org/packages/16/16/af42337b53844e67752a092481ab869c0523bc95c4e5c98e4dac4e9581ac/numpy-2.4.1-cp313-cp313t-win_amd64.whl", hash = "sha256:62fea415f83ad8fdb6c20840578e5fbaf5ddd65e0ec6c3c47eda0f69da172510", size = 12447478, upload-time = "2026-01-10T06:43:50.476Z" }, + { url = "https://files.pythonhosted.org/packages/6c/f8/fa85b2eac68ec631d0b631abc448552cb17d39afd17ec53dcbcc3537681a/numpy-2.4.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a7870e8c5fc11aef57d6fea4b4085e537a3a60ad2cdd14322ed531fdca68d261", size = 10382981, upload-time = "2026-01-10T06:43:52.575Z" }, + { url = "https://files.pythonhosted.org/packages/1b/a7/ef08d25698e0e4b4efbad8d55251d20fe2a15f6d9aa7c9b30cd03c165e6f/numpy-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3869ea1ee1a1edc16c29bbe3a2f2a4e515cc3a44d43903ad41e0cacdbaf733dc", size = 16652046, upload-time = "2026-01-10T06:43:54.797Z" }, + { url = "https://files.pythonhosted.org/packages/8f/39/e378b3e3ca13477e5ac70293ec027c438d1927f18637e396fe90b1addd72/numpy-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e867df947d427cdd7a60e3e271729090b0f0df80f5f10ab7dd436f40811699c3", size = 12378858, upload-time = "2026-01-10T06:43:57.099Z" }, + { url = "https://files.pythonhosted.org/packages/c3/74/7ec6154f0006910ed1fdbb7591cf4432307033102b8a22041599935f8969/numpy-2.4.1-cp314-cp314-macosx_14_0_arm64.whl", hash = 
"sha256:e3bd2cb07841166420d2fa7146c96ce00cb3410664cbc1a6be028e456c4ee220", size = 5207417, upload-time = "2026-01-10T06:43:59.037Z" }, + { url = "https://files.pythonhosted.org/packages/f7/b7/053ac11820d84e42f8feea5cb81cc4fcd1091499b45b1ed8c7415b1bf831/numpy-2.4.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:f0a90aba7d521e6954670550e561a4cb925713bd944445dbe9e729b71f6cabee", size = 6542643, upload-time = "2026-01-10T06:44:01.852Z" }, + { url = "https://files.pythonhosted.org/packages/c0/c4/2e7908915c0e32ca636b92e4e4a3bdec4cb1e7eb0f8aedf1ed3c68a0d8cd/numpy-2.4.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d558123217a83b2d1ba316b986e9248a1ed1971ad495963d555ccd75dcb1556", size = 14418963, upload-time = "2026-01-10T06:44:04.047Z" }, + { url = "https://files.pythonhosted.org/packages/eb/c0/3ed5083d94e7ffd7c404e54619c088e11f2e1939a9544f5397f4adb1b8ba/numpy-2.4.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f44de05659b67d20499cbc96d49f2650769afcb398b79b324bb6e297bfe3844", size = 16363811, upload-time = "2026-01-10T06:44:06.207Z" }, + { url = "https://files.pythonhosted.org/packages/0e/68/42b66f1852bf525050a67315a4fb94586ab7e9eaa541b1bef530fab0c5dd/numpy-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:69e7419c9012c4aaf695109564e3387f1259f001b4326dfa55907b098af082d3", size = 16197643, upload-time = "2026-01-10T06:44:08.33Z" }, + { url = "https://files.pythonhosted.org/packages/d2/40/e8714fc933d85f82c6bfc7b998a0649ad9769a32f3494ba86598aaf18a48/numpy-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2ffd257026eb1b34352e749d7cc1678b5eeec3e329ad8c9965a797e08ccba205", size = 18289601, upload-time = "2026-01-10T06:44:10.841Z" }, + { url = "https://files.pythonhosted.org/packages/80/9a/0d44b468cad50315127e884802351723daca7cf1c98d102929468c81d439/numpy-2.4.1-cp314-cp314-win32.whl", hash = "sha256:727c6c3275ddefa0dc078524a85e064c057b4f4e71ca5ca29a19163c607be745", size = 6005722, 
upload-time = "2026-01-10T06:44:13.332Z" }, + { url = "https://files.pythonhosted.org/packages/7e/bb/c6513edcce5a831810e2dddc0d3452ce84d208af92405a0c2e58fd8e7881/numpy-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:7d5d7999df434a038d75a748275cd6c0094b0ecdb0837342b332a82defc4dc4d", size = 12438590, upload-time = "2026-01-10T06:44:15.006Z" }, + { url = "https://files.pythonhosted.org/packages/e9/da/a598d5cb260780cf4d255102deba35c1d072dc028c4547832f45dd3323a8/numpy-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:ce9ce141a505053b3c7bce3216071f3bf5c182b8b28930f14cd24d43932cd2df", size = 10596180, upload-time = "2026-01-10T06:44:17.386Z" }, + { url = "https://files.pythonhosted.org/packages/de/bc/ea3f2c96fcb382311827231f911723aeff596364eb6e1b6d1d91128aa29b/numpy-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:4e53170557d37ae404bf8d542ca5b7c629d6efa1117dac6a83e394142ea0a43f", size = 12498774, upload-time = "2026-01-10T06:44:19.467Z" }, + { url = "https://files.pythonhosted.org/packages/aa/ab/ef9d939fe4a812648c7a712610b2ca6140b0853c5efea361301006c02ae5/numpy-2.4.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:a73044b752f5d34d4232f25f18160a1cc418ea4507f5f11e299d8ac36875f8a0", size = 5327274, upload-time = "2026-01-10T06:44:23.189Z" }, + { url = "https://files.pythonhosted.org/packages/bd/31/d381368e2a95c3b08b8cf7faac6004849e960f4a042d920337f71cef0cae/numpy-2.4.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:fb1461c99de4d040666ca0444057b06541e5642f800b71c56e6ea92d6a853a0c", size = 6648306, upload-time = "2026-01-10T06:44:25.012Z" }, + { url = "https://files.pythonhosted.org/packages/c8/e5/0989b44ade47430be6323d05c23207636d67d7362a1796ccbccac6773dd2/numpy-2.4.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:423797bdab2eeefbe608d7c1ec7b2b4fd3c58d51460f1ee26c7500a1d9c9ee93", size = 14464653, upload-time = "2026-01-10T06:44:26.706Z" }, + { url = 
"https://files.pythonhosted.org/packages/10/a7/cfbe475c35371cae1358e61f20c5f075badc18c4797ab4354140e1d283cf/numpy-2.4.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:52b5f61bdb323b566b528899cc7db2ba5d1015bda7ea811a8bcf3c89c331fa42", size = 16405144, upload-time = "2026-01-10T06:44:29.378Z" }, + { url = "https://files.pythonhosted.org/packages/f8/a3/0c63fe66b534888fa5177cc7cef061541064dbe2b4b60dcc60ffaf0d2157/numpy-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:42d7dd5fa36d16d52a84f821eb96031836fd405ee6955dd732f2023724d0aa01", size = 16247425, upload-time = "2026-01-10T06:44:31.721Z" }, + { url = "https://files.pythonhosted.org/packages/6b/2b/55d980cfa2c93bd40ff4c290bf824d792bd41d2fe3487b07707559071760/numpy-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e7b6b5e28bbd47b7532698e5db2fe1db693d84b58c254e4389d99a27bb9b8f6b", size = 18330053, upload-time = "2026-01-10T06:44:34.617Z" }, + { url = "https://files.pythonhosted.org/packages/23/12/8b5fc6b9c487a09a7957188e0943c9ff08432c65e34567cabc1623b03a51/numpy-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:5de60946f14ebe15e713a6f22850c2372fa72f4ff9a432ab44aa90edcadaa65a", size = 6152482, upload-time = "2026-01-10T06:44:36.798Z" }, + { url = "https://files.pythonhosted.org/packages/00/a5/9f8ca5856b8940492fc24fbe13c1bc34d65ddf4079097cf9e53164d094e1/numpy-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:8f085da926c0d491ffff3096f91078cc97ea67e7e6b65e490bc8dcda65663be2", size = 12627117, upload-time = "2026-01-10T06:44:38.828Z" }, + { url = "https://files.pythonhosted.org/packages/ad/0d/eca3d962f9eef265f01a8e0d20085c6dd1f443cbffc11b6dede81fd82356/numpy-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:6436cffb4f2bf26c974344439439c95e152c9a527013f26b3577be6c2ca64295", size = 10667121, upload-time = "2026-01-10T06:44:41.644Z" }, +] + [[package]] name = "packaging" version = "26.0" @@ -568,6 +649,71 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, ] +[[package]] +name = "pandas" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "python-dateutil" }, + { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/de/da/b1dc0481ab8d55d0f46e343cfe67d4551a0e14fcee52bd38ca1bd73258d8/pandas-3.0.0.tar.gz", hash = "sha256:0facf7e87d38f721f0af46fe70d97373a37701b1c09f7ed7aeeb292ade5c050f", size = 4633005, upload-time = "2026-01-21T15:52:04.726Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/38/db33686f4b5fa64d7af40d96361f6a4615b8c6c8f1b3d334eee46ae6160e/pandas-3.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9803b31f5039b3c3b10cc858c5e40054adb4b29b4d81cb2fd789f4121c8efbcd", size = 10334013, upload-time = "2026-01-21T15:50:34.771Z" }, + { url = "https://files.pythonhosted.org/packages/a5/7b/9254310594e9774906bacdd4e732415e1f86ab7dbb4b377ef9ede58cd8ec/pandas-3.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:14c2a4099cd38a1d18ff108168ea417909b2dea3bd1ebff2ccf28ddb6a74d740", size = 9874154, upload-time = "2026-01-21T15:50:36.67Z" }, + { url = "https://files.pythonhosted.org/packages/63/d4/726c5a67a13bc66643e66d2e9ff115cead482a44fc56991d0c4014f15aaf/pandas-3.0.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d257699b9a9960e6125686098d5714ac59d05222bef7a5e6af7a7fd87c650801", size = 10384433, upload-time = "2026-01-21T15:50:39.132Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/2e/9211f09bedb04f9832122942de8b051804b31a39cfbad199a819bb88d9f3/pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:69780c98f286076dcafca38d8b8eee1676adf220199c0a39f0ecbf976b68151a", size = 10864519, upload-time = "2026-01-21T15:50:41.043Z" }, + { url = "https://files.pythonhosted.org/packages/00/8d/50858522cdc46ac88b9afdc3015e298959a70a08cd21e008a44e9520180c/pandas-3.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4a66384f017240f3858a4c8a7cf21b0591c3ac885cddb7758a589f0f71e87ebb", size = 11394124, upload-time = "2026-01-21T15:50:43.377Z" }, + { url = "https://files.pythonhosted.org/packages/86/3f/83b2577db02503cd93d8e95b0f794ad9d4be0ba7cb6c8bcdcac964a34a42/pandas-3.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be8c515c9bc33989d97b89db66ea0cececb0f6e3c2a87fcc8b69443a6923e95f", size = 11920444, upload-time = "2026-01-21T15:50:45.932Z" }, + { url = "https://files.pythonhosted.org/packages/64/2d/4f8a2f192ed12c90a0aab47f5557ece0e56b0370c49de9454a09de7381b2/pandas-3.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a453aad8c4f4e9f166436994a33884442ea62aa8b27d007311e87521b97246e1", size = 9730970, upload-time = "2026-01-21T15:50:47.962Z" }, + { url = "https://files.pythonhosted.org/packages/d4/64/ff571be435cf1e643ca98d0945d76732c0b4e9c37191a89c8550b105eed1/pandas-3.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:da768007b5a33057f6d9053563d6b74dd6d029c337d93c6d0d22a763a5c2ecc0", size = 9041950, upload-time = "2026-01-21T15:50:50.422Z" }, + { url = "https://files.pythonhosted.org/packages/6f/fa/7f0ac4ca8877c57537aaff2a842f8760e630d8e824b730eb2e859ffe96ca/pandas-3.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b78d646249b9a2bc191040988c7bb524c92fa8534fb0898a0741d7e6f2ffafa6", size = 10307129, upload-time = "2026-01-21T15:50:52.877Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/11/28a221815dcea4c0c9414dfc845e34a84a6a7dabc6da3194498ed5ba4361/pandas-3.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bc9cba7b355cb4162442a88ce495e01cb605f17ac1e27d6596ac963504e0305f", size = 9850201, upload-time = "2026-01-21T15:50:54.807Z" }, + { url = "https://files.pythonhosted.org/packages/ba/da/53bbc8c5363b7e5bd10f9ae59ab250fc7a382ea6ba08e4d06d8694370354/pandas-3.0.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c9a1a149aed3b6c9bf246033ff91e1b02d529546c5d6fb6b74a28fea0cf4c70", size = 10354031, upload-time = "2026-01-21T15:50:57.463Z" }, + { url = "https://files.pythonhosted.org/packages/f7/a3/51e02ebc2a14974170d51e2410dfdab58870ea9bcd37cda15bd553d24dc4/pandas-3.0.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:95683af6175d884ee89471842acfca29172a85031fccdabc35e50c0984470a0e", size = 10861165, upload-time = "2026-01-21T15:50:59.32Z" }, + { url = "https://files.pythonhosted.org/packages/a5/fe/05a51e3cac11d161472b8297bd41723ea98013384dd6d76d115ce3482f9b/pandas-3.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1fbbb5a7288719e36b76b4f18d46ede46e7f916b6c8d9915b756b0a6c3f792b3", size = 11359359, upload-time = "2026-01-21T15:51:02.014Z" }, + { url = "https://files.pythonhosted.org/packages/ee/56/ba620583225f9b85a4d3e69c01df3e3870659cc525f67929b60e9f21dcd1/pandas-3.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8e8b9808590fa364416b49b2a35c1f4cf2785a6c156935879e57f826df22038e", size = 11912907, upload-time = "2026-01-21T15:51:05.175Z" }, + { url = "https://files.pythonhosted.org/packages/c9/8c/c6638d9f67e45e07656b3826405c5cc5f57f6fd07c8b2572ade328c86e22/pandas-3.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:98212a38a709feb90ae658cb6227ea3657c22ba8157d4b8f913cd4c950de5e7e", size = 9732138, upload-time = "2026-01-21T15:51:07.569Z" }, + { url = 
"https://files.pythonhosted.org/packages/7b/bf/bd1335c3bf1770b6d8fed2799993b11c4971af93bb1b729b9ebbc02ca2ec/pandas-3.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:177d9df10b3f43b70307a149d7ec49a1229a653f907aa60a48f1877d0e6be3be", size = 9033568, upload-time = "2026-01-21T15:51:09.484Z" }, + { url = "https://files.pythonhosted.org/packages/8e/c6/f5e2171914d5e29b9171d495344097d54e3ffe41d2d85d8115baba4dc483/pandas-3.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2713810ad3806767b89ad3b7b69ba153e1c6ff6d9c20f9c2140379b2a98b6c98", size = 10741936, upload-time = "2026-01-21T15:51:11.693Z" }, + { url = "https://files.pythonhosted.org/packages/51/88/9a0164f99510a1acb9f548691f022c756c2314aad0d8330a24616c14c462/pandas-3.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:15d59f885ee5011daf8335dff47dcb8a912a27b4ad7826dc6cbe809fd145d327", size = 10393884, upload-time = "2026-01-21T15:51:14.197Z" }, + { url = "https://files.pythonhosted.org/packages/e0/53/b34d78084d88d8ae2b848591229da8826d1e65aacf00b3abe34023467648/pandas-3.0.0-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24e6547fb64d2c92665dd2adbfa4e85fa4fd70a9c070e7cfb03b629a0bbab5eb", size = 10310740, upload-time = "2026-01-21T15:51:16.093Z" }, + { url = "https://files.pythonhosted.org/packages/5b/d3/bee792e7c3d6930b74468d990604325701412e55d7aaf47460a22311d1a5/pandas-3.0.0-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:48ee04b90e2505c693d3f8e8f524dab8cb8aaf7ddcab52c92afa535e717c4812", size = 10700014, upload-time = "2026-01-21T15:51:18.818Z" }, + { url = "https://files.pythonhosted.org/packages/55/db/2570bc40fb13aaed1cbc3fbd725c3a60ee162477982123c3adc8971e7ac1/pandas-3.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:66f72fb172959af42a459e27a8d8d2c7e311ff4c1f7db6deb3b643dbc382ae08", size = 11323737, upload-time = "2026-01-21T15:51:20.784Z" }, + { url = 
"https://files.pythonhosted.org/packages/bc/2e/297ac7f21c8181b62a4cccebad0a70caf679adf3ae5e83cb676194c8acc3/pandas-3.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4a4a400ca18230976724a5066f20878af785f36c6756e498e94c2a5e5d57779c", size = 11771558, upload-time = "2026-01-21T15:51:22.977Z" }, + { url = "https://files.pythonhosted.org/packages/0a/46/e1c6876d71c14332be70239acce9ad435975a80541086e5ffba2f249bcf6/pandas-3.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:940eebffe55528074341a5a36515f3e4c5e25e958ebbc764c9502cfc35ba3faa", size = 10473771, upload-time = "2026-01-21T15:51:25.285Z" }, + { url = "https://files.pythonhosted.org/packages/c0/db/0270ad9d13c344b7a36fa77f5f8344a46501abf413803e885d22864d10bf/pandas-3.0.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:597c08fb9fef0edf1e4fa2f9828dd27f3d78f9b8c9b4a748d435ffc55732310b", size = 10312075, upload-time = "2026-01-21T15:51:28.5Z" }, + { url = "https://files.pythonhosted.org/packages/09/9f/c176f5e9717f7c91becfe0f55a52ae445d3f7326b4a2cf355978c51b7913/pandas-3.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:447b2d68ac5edcbf94655fe909113a6dba6ef09ad7f9f60c80477825b6c489fe", size = 9900213, upload-time = "2026-01-21T15:51:30.955Z" }, + { url = "https://files.pythonhosted.org/packages/d9/e7/63ad4cc10b257b143e0a5ebb04304ad806b4e1a61c5da25f55896d2ca0f4/pandas-3.0.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:debb95c77ff3ed3ba0d9aa20c3a2f19165cc7956362f9873fce1ba0a53819d70", size = 10428768, upload-time = "2026-01-21T15:51:33.018Z" }, + { url = "https://files.pythonhosted.org/packages/9e/0e/4e4c2d8210f20149fd2248ef3fff26623604922bd564d915f935a06dd63d/pandas-3.0.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fedabf175e7cd82b69b74c30adbaa616de301291a5231138d7242596fc296a8d", size = 10882954, upload-time = "2026-01-21T15:51:35.287Z" }, + { url = 
"https://files.pythonhosted.org/packages/c6/60/c9de8ac906ba1f4d2250f8a951abe5135b404227a55858a75ad26f84db47/pandas-3.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:412d1a89aab46889f3033a386912efcdfa0f1131c5705ff5b668dda88305e986", size = 11430293, upload-time = "2026-01-21T15:51:37.57Z" }, + { url = "https://files.pythonhosted.org/packages/a1/69/806e6637c70920e5787a6d6896fd707f8134c2c55cd761e7249a97b7dc5a/pandas-3.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e979d22316f9350c516479dd3a92252be2937a9531ed3a26ec324198a99cdd49", size = 11952452, upload-time = "2026-01-21T15:51:39.618Z" }, + { url = "https://files.pythonhosted.org/packages/cb/de/918621e46af55164c400ab0ef389c9d969ab85a43d59ad1207d4ddbe30a5/pandas-3.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:083b11415b9970b6e7888800c43c82e81a06cd6b06755d84804444f0007d6bb7", size = 9851081, upload-time = "2026-01-21T15:51:41.758Z" }, + { url = "https://files.pythonhosted.org/packages/91/a1/3562a18dd0bd8c73344bfa26ff90c53c72f827df119d6d6b1dacc84d13e3/pandas-3.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:5db1e62cb99e739fa78a28047e861b256d17f88463c76b8dafc7c1338086dca8", size = 9174610, upload-time = "2026-01-21T15:51:44.312Z" }, + { url = "https://files.pythonhosted.org/packages/ce/26/430d91257eaf366f1737d7a1c158677caaf6267f338ec74e3a1ec444111c/pandas-3.0.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:697b8f7d346c68274b1b93a170a70974cdc7d7354429894d5927c1effdcccd73", size = 10761999, upload-time = "2026-01-21T15:51:46.899Z" }, + { url = "https://files.pythonhosted.org/packages/ec/1a/954eb47736c2b7f7fe6a9d56b0cb6987773c00faa3c6451a43db4beb3254/pandas-3.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:8cb3120f0d9467ed95e77f67a75e030b67545bcfa08964e349252d674171def2", size = 10410279, upload-time = "2026-01-21T15:51:48.89Z" }, + { url = 
"https://files.pythonhosted.org/packages/20/fc/b96f3a5a28b250cd1b366eb0108df2501c0f38314a00847242abab71bb3a/pandas-3.0.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:33fd3e6baa72899746b820c31e4b9688c8e1b7864d7aec2de7ab5035c285277a", size = 10330198, upload-time = "2026-01-21T15:51:51.015Z" }, + { url = "https://files.pythonhosted.org/packages/90/b3/d0e2952f103b4fbef1ef22d0c2e314e74fc9064b51cee30890b5e3286ee6/pandas-3.0.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8942e333dc67ceda1095227ad0febb05a3b36535e520154085db632c40ad084", size = 10728513, upload-time = "2026-01-21T15:51:53.387Z" }, + { url = "https://files.pythonhosted.org/packages/76/81/832894f286df828993dc5fd61c63b231b0fb73377e99f6c6c369174cf97e/pandas-3.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:783ac35c4d0fe0effdb0d67161859078618b1b6587a1af15928137525217a721", size = 11345550, upload-time = "2026-01-21T15:51:55.329Z" }, + { url = "https://files.pythonhosted.org/packages/34/a0/ed160a00fb4f37d806406bc0a79a8b62fe67f29d00950f8d16203ff3409b/pandas-3.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:125eb901e233f155b268bbef9abd9afb5819db74f0e677e89a61b246228c71ac", size = 11799386, upload-time = "2026-01-21T15:51:57.457Z" }, + { url = "https://files.pythonhosted.org/packages/36/c8/2ac00d7255252c5e3cf61b35ca92ca25704b0188f7454ca4aec08a33cece/pandas-3.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b86d113b6c109df3ce0ad5abbc259fe86a1bd4adfd4a31a89da42f84f65509bb", size = 10873041, upload-time = "2026-01-21T15:52:00.034Z" }, + { url = "https://files.pythonhosted.org/packages/e6/3f/a80ac00acbc6b35166b42850e98a4f466e2c0d9c64054161ba9620f95680/pandas-3.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:1c39eab3ad38f2d7a249095f0a3d8f8c22cc0f847e98ccf5bbe732b272e2d9fa", size = 9441003, upload-time = "2026-01-21T15:52:02.281Z" }, +] + +[[package]] +name = "pandas-stubs" +version = "2.3.3.260113" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "types-pytz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/92/5d/be23854a73fda69f1dbdda7bc10fbd6f930bd1fa87aaec389f00c901c1e8/pandas_stubs-2.3.3.260113.tar.gz", hash = "sha256:076e3724bcaa73de78932b012ec64b3010463d377fa63116f4e6850643d93800", size = 116131, upload-time = "2026-01-13T22:30:16.704Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/c6/df1fe324248424f77b89371116dab5243db7f052c32cc9fe7442ad9c5f75/pandas_stubs-2.3.3.260113-py3-none-any.whl", hash = "sha256:ec070b5c576e1badf12544ae50385872f0631fc35d99d00dc598c2954ec564d3", size = 168246, upload-time = "2026-01-13T22:30:15.244Z" }, +] + [[package]] name = "pathspec" version = "1.0.3" @@ -751,6 +897,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, ] +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + [[package]] name = "python-dotenv" version = "1.2.1" @@ -832,6 +990,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/9e/6a/40fee331a52339926a92e17ae748827270b288a35ef4a15c9c8f2ec54715/ruff-0.14.14-py3-none-win_arm64.whl", hash = "sha256:56e6981a98b13a32236a72a8da421d7839221fa308b223b9283312312e5ac76c", size = 10920448, upload-time = "2026-01-22T22:30:15.417Z" }, ] +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + [[package]] name = "sqlalchemy" version = "2.0.46" @@ -901,6 +1068,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/45/a132b9074aa18e799b891b91ad72133c98d8042c70f6240e4c5f9dabee2f/structlog-25.5.0-py3-none-any.whl", hash = "sha256:a8453e9b9e636ec59bd9e79bbd4a72f025981b3ba0f5837aebf48f02f37a7f9f", size = 72510, upload-time = "2025-10-27T08:28:21.535Z" }, ] +[[package]] +name = "types-pytz" +version = "2025.2.0.20251108" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/40/ff/c047ddc68c803b46470a357454ef76f4acd8c1088f5cc4891cdd909bfcf6/types_pytz-2025.2.0.20251108.tar.gz", hash = "sha256:fca87917836ae843f07129567b74c1929f1870610681b4c92cb86a3df5817bdb", size = 10961, upload-time = "2025-11-08T02:55:57.001Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/c1/56ef16bf5dcd255155cc736d276efa6ae0a5c26fd685e28f0412a4013c01/types_pytz-2025.2.0.20251108-py3-none-any.whl", hash = 
"sha256:0f1c9792cab4eb0e46c52f8845c8f77cf1e313cb3d68bf826aa867fe4717d91c", size = 10116, upload-time = "2025-11-08T02:55:56.194Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" @@ -922,6 +1098,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, ] +[[package]] +name = "tzdata" +version = "2025.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, +] + [[package]] name = "uvicorn" version = "0.40.0" From 1e0db5eeb92d2abd4052fbabde7b4710780a36c9 Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sat, 31 Jan 2026 23:46:14 +0000 Subject: [PATCH 2/5] fix: address code review feedback Service: - Fix min_periods falsy check to explicit None check (preserves 0) Tests: - Add expanding_mean to test_valid_strategies, expect 6 strategies Documentation: - Update PR reference from #24 to #25 - Fix all GET /featuresets/preview to POST in PRP-4 Co-Authored-By: Claude Opus 4.5 --- PRPs/PRP-4-feature-engineering.md | 12 ++++++------ app/features/featuresets/service.py | 2 +- app/features/featuresets/tests/test_schemas.py | 3 ++- docs/PHASE/3-FEATURE_ENGINEERING.md | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) 
diff --git a/PRPs/PRP-4-feature-engineering.md b/PRPs/PRP-4-feature-engineering.md index 10a831e2..fe3fc008 100644 --- a/PRPs/PRP-4-feature-engineering.md +++ b/PRPs/PRP-4-feature-engineering.md @@ -38,7 +38,7 @@ Implement a time-safe feature engineering module for the ForecastLabAI forecasti - [ ] Exogenous features (price/promo/inventory) lagged appropriately - [ ] Imputation strategies: zero-fill for sales, forward-fill for prices - [ ] `POST /featuresets/compute` endpoint accepts config + data window + cutoff -- [ ] `GET /featuresets/preview` endpoint for single-series feature inspection +- [ ] `POST /featuresets/preview` endpoint for single-series feature inspection - [ ] Unit tests for each feature type - [ ] **Leakage tests** verify no future data used - [ ] Integration tests with real DB queries @@ -150,7 +150,7 @@ app/ ├── __init__.py # Module exports ├── schemas.py # FeatureSetConfig, LagConfig, RollingConfig, etc. ├── service.py # FeatureEngineeringService - ├── routes.py # POST /featuresets/compute, GET /featuresets/preview + ├── routes.py # POST /featuresets/compute, POST /featuresets/preview └── tests/ ├── __init__.py ├── conftest.py # Feature-specific fixtures, sample time series @@ -403,7 +403,7 @@ class ComputeFeaturesResponse(BaseModel): class PreviewFeaturesRequest(BaseModel): - """Request for GET /featuresets/preview.""" + """Request for POST /featuresets/preview.""" store_id: int = Field(..., ge=1) product_id: int = Field(..., ge=1) @@ -517,7 +517,7 @@ Task 7: Implement featuresets routes IMPLEMENT: - Router with tag "featuresets" - POST /featuresets/compute endpoint - - GET /featuresets/preview endpoint + - POST /featuresets/preview endpoint - Error handling with ForecastLabError VALIDATION: - uv run mypy app/features/featuresets/routes.py @@ -579,7 +579,7 @@ Task 13: Create integration tests for routes FILE: app/features/featuresets/tests/test_routes.py IMPLEMENT: - Test POST /featuresets/compute with valid payload - - Test GET 
/featuresets/preview returns sample rows + - Test POST /featuresets/preview returns sample rows - Test error handling for missing data - Test config_hash consistency REQUIRES: @@ -916,7 +916,7 @@ ROUTES: - ADD: app.include_router(featuresets_router) - ENDPOINTS: - POST /featuresets/compute - - GET /featuresets/preview + - POST /featuresets/preview DEPENDENCIES: - pandas (add to pyproject.toml if not present) diff --git a/app/features/featuresets/service.py b/app/features/featuresets/service.py index 71707022..417babaa 100644 --- a/app/features/featuresets/service.py +++ b/app/features/featuresets/service.py @@ -213,7 +213,7 @@ def _compute_rolling_features(self, df: pd.DataFrame) -> tuple[pd.DataFrame, lis columns: list[str] = [] for window in config.windows: - min_per = config.min_periods if config.min_periods else window + min_per = config.min_periods if config.min_periods is not None else window for agg in config.aggregations: col_name = f"rolling_{agg}_{window}" diff --git a/app/features/featuresets/tests/test_schemas.py b/app/features/featuresets/tests/test_schemas.py index 9f855d8e..4f9a3840 100644 --- a/app/features/featuresets/tests/test_schemas.py +++ b/app/features/featuresets/tests/test_schemas.py @@ -126,10 +126,11 @@ def test_valid_strategies(self): "price": "ffill", "inventory": "bfill", "demand": "mean", + "forecast": "expanding_mean", "optional": "drop", } ) - assert len(config.strategies) == 5 + assert len(config.strategies) == 6 def test_rejects_invalid_strategy(self): """Invalid strategy should be rejected.""" diff --git a/docs/PHASE/3-FEATURE_ENGINEERING.md b/docs/PHASE/3-FEATURE_ENGINEERING.md index bfd1ab10..bc4b4806 100644 --- a/docs/PHASE/3-FEATURE_ENGINEERING.md +++ b/docs/PHASE/3-FEATURE_ENGINEERING.md @@ -2,7 +2,7 @@ **Date Completed**: 2026-01-31 **PRP**: [PRP-4-feature-engineering.md](../../PRPs/PRP-4-feature-engineering.md) -**Release**: PR #24 +**Release**: PR #25 --- From e2c57ffb35cfa1fe0a4d0b6b9d1f56be9abdc7d9 Mon Sep 17 00:00:00 
2001 From: Gabor Szabo <168316277+w7-mgfcode@users.noreply.github.com> Date: Sun, 1 Feb 2026 01:12:16 +0100 Subject: [PATCH 3/5] docs: update DAILY-FLOW.md for Phase 4 Forecasting (#27) * docs: update DAILY-FLOW.md for Phase 4 Forecasting Co-Authored-By: Claude Opus 4.5 * docs: add PRP-5 for Forecasting module Comprehensive PRP including: - Model zoo (naive, seasonal naive, moving average) - Unified BaseForecaster interface (fit/predict/serialize) - ModelBundle persistence with joblib - 15 ordered implementation tasks - 40+ test cases specified - Integration with FeatureEngineeringService Confidence: 8/10 Co-Authored-By: Claude Opus 4.5 --------- Co-authored-by: Gabe@w7dev Co-authored-by: Claude Opus 4.5 --- PRPs/PRP-5-forecasting.md | 970 ++++++++++++++++++++++++++++++++++++++ docs/DAILY-FLOW.md | 13 +- 2 files changed, 980 insertions(+), 3 deletions(-) create mode 100644 PRPs/PRP-5-forecasting.md diff --git a/PRPs/PRP-5-forecasting.md b/PRPs/PRP-5-forecasting.md new file mode 100644 index 00000000..5f5868cb --- /dev/null +++ b/PRPs/PRP-5-forecasting.md @@ -0,0 +1,970 @@ +# PRP-5: Forecast Models v0 (Baselines + Global ML Hooks) + +## Goal + +Implement a model zoo with baseline forecasting algorithms and a unified model interface for the ForecastLabAI forecasting pipeline. The module provides naive, seasonal naive, and moving average models with fit/predict/serialize capabilities, extensible to scikit-learn regression pipelines. 
+ +**End State:** A production-ready `forecasting` vertical slice with: +- `BaseForecaster` — Abstract base class defining the model interface +- `NaiveForecaster`, `SeasonalNaiveForecaster`, `MovingAverageForecaster` — Baseline implementations +- `LightGBMForecaster` — Optional ML model with feature engineering integration +- `ModelBundle` — Joblib-based serialization with metadata and feature hash +- `POST /forecasting/train` — Train a model on historical data +- `POST /forecasting/predict` — Generate forecasts for a horizon +- Recursive multi-horizon forecasting support +- All validation gates passing (ruff, mypy, pyright, pytest) + +--- + +## Why + +- **Foundation for ForecastOps**: Models are required before backtesting (INITIAL-6) and registry (INITIAL-7) +- **Baseline Benchmarks**: Simple models establish performance baselines for ML comparison +- **Reproducibility**: Unified interface + serialization enables consistent model deployment +- **Integration Ready**: Works with FeatureEngineeringService for automated lag injection + +--- + +## What + +### User-Visible Behavior + +1. **Train Endpoint**: Accept store/product/date range, return trained model artifact +2. **Predict Endpoint**: Load model, generate multi-step forecasts +3. **Model Types**: naive, seasonal_naive, moving_average, lightgbm (feature-flagged) +4. 
**Persistence**: Save/load models with full metadata for reproducibility + +### Success Criteria + +- [ ] All three baseline models implement fit/predict/serialize/load +- [ ] Unified `BaseForecaster` interface with type hints +- [ ] Recursive forecasting for multi-horizon predictions +- [ ] `ModelBundle` includes model + config + feature_hash + metadata +- [ ] Deterministic results with configurable random seed +- [ ] Integration with `FeatureEngineeringService` for ML models +- [ ] 40+ unit tests including determinism and serialization tests +- [ ] Example files demonstrating each model type + +--- + +## All Needed Context + +### Documentation & References + +```yaml +# MUST READ - Include these in your context window + +# Scikit-learn Estimator Interface +- url: https://scikit-learn.org/stable/developers/develop.html + why: "BaseEstimator patterns, fit/predict contract, get_params/set_params" + critical: "All __init__ params must be explicit keyword args (no *args/**kwargs)" + +# Scikit-learn Model Persistence +- url: https://scikit-learn.org/stable/model_persistence.html + why: "Joblib dump/load patterns, compression, version compatibility warnings" + critical: "Models saved with one sklearn version may not load in another" + +# Scikit-learn Pipeline Composition +- url: https://scikit-learn.org/stable/modules/compose.html + why: "Pipeline construction for Scaling -> Encoding -> Regressor" + critical: "Pipeline requires fit/transform on all but last step" + +# Recursive Multi-Step Forecasting +- url: https://skforecast.org/0.9.1/user_guides/autoregresive-forecaster + why: "Pattern for iterating predictions as input for next step" + critical: "Error propagation increases with horizon length" + +# Naive/Seasonal Forecasting +- url: https://forecastegy.com/posts/naive-time-series-forecasting-in-python/ + why: "Implementation patterns for naive and seasonal naive" + +# Multi-Step Forecasting Strategies +- url: 
https://machinelearningmastery.com/multi-step-time-series-forecasting/ + why: "Recursive vs Direct vs Multi-output strategies" + +# Internal Codebase References +- file: app/features/featuresets/schemas.py + why: "Pattern for frozen Pydantic configs with config_hash()" + +- file: app/features/featuresets/service.py + why: "Pattern for service class with cutoff enforcement" + +- file: app/features/featuresets/tests/conftest.py + why: "Pattern for test fixtures with sequential data" + +- file: app/core/config.py + why: "Pattern for Settings with environment variables" + +- file: docs/ARCHITECTURE.md + why: "Section 7 describes ForecastOps requirements" +``` + +### Current Codebase Tree (Relevant Parts) + +``` +app/ +├── core/ +│ ├── config.py # Settings singleton +│ ├── database.py # AsyncSession, get_db +│ ├── exceptions.py # ForecastLabError base +│ └── logging.py # Structured logging +├── shared/ +│ └── models.py # TimestampMixin +├── features/ +│ ├── data_platform/ +│ │ └── models.py # SalesDaily, Store, Product, Calendar +│ ├── featuresets/ +│ │ ├── schemas.py # FeatureSetConfig, config_hash() +│ │ ├── service.py # FeatureEngineeringService +│ │ └── routes.py # POST /featuresets/compute +│ └── ingest/ +│ └── ... +└── main.py # FastAPI app with router registration +``` + +### Desired Codebase Tree + +``` +app/features/forecasting/ # NEW: Forecasting vertical slice +├── __init__.py # Module exports +├── schemas.py # ModelConfig, TrainRequest, PredictRequest, PredictResponse +├── models.py # BaseForecaster, NaiveForecaster, SeasonalNaiveForecaster, etc. 
+├── service.py # ForecastingService (orchestration) +├── persistence.py # ModelBundle, save/load functions +├── routes.py # POST /forecasting/train, POST /forecasting/predict +└── tests/ + ├── __init__.py + ├── conftest.py # Fixtures: sample configs, time series data + ├── test_schemas.py # Config validation, immutability + ├── test_models.py # Model fit/predict, determinism + ├── test_persistence.py # Serialization round-trip + ├── test_service.py # Orchestration logic + └── test_routes.py # Integration tests + +examples/models/ # NEW: Example scripts +├── baseline_naive.py # Train and predict with naive model +├── baseline_seasonal.py # Train and predict with seasonal naive +├── baseline_mavg.py # Train and predict with moving average +└── model_interface.md # Contract documentation + +app/core/config.py # MODIFY: Add forecasting settings +app/main.py # MODIFY: Register forecasting router +``` + +### Known Gotchas + +```python +# CRITICAL: Pydantic v2 uses model_config = ConfigDict(...), not class Config +# Example: frozen=True for immutability, extra="forbid" for strict validation + +# CRITICAL: Use field_validator (not @validator) with @classmethod decorator +# Example: @field_validator("horizon") @classmethod def validate_horizon(...) 
+ +# CRITICAL: Joblib artifacts are tied to the Python/library versions that wrote them - record both in bundle metadata +# Models may not load if trained on different Python/sklearn version + +# CRITICAL: Recursive forecasting propagates errors - warn users for long horizons + +# CRITICAL: All forecasters must be deterministic with fixed random_state +# Use Settings().forecast_random_seed consistently + +# CRITICAL: Multi-horizon forecasting updates lags recursively +# Prediction at t+1 becomes lag_1 for prediction at t+2 + +# CRITICAL: Feature engineering must use cutoff_date = last training date +# Never use future data when computing features for prediction +``` + +--- + +## Implementation Blueprint + +### Data Models and Schemas + +```python +# app/features/forecasting/schemas.py + +from pydantic import BaseModel, ConfigDict, Field, field_validator +from typing import Literal +from datetime import date +import hashlib + +class ModelConfigBase(BaseModel): + """Base configuration for all forecasting models.""" + model_config = ConfigDict(frozen=True, extra="forbid") + + schema_version: str = Field(default="1.0", pattern=r"^\d+\.\d+(\.\d+)?$") + + def config_hash(self) -> str: + """Deterministic hash for reproducibility.""" + return hashlib.sha256(self.model_dump_json().encode()).hexdigest()[:16] + + +class NaiveModelConfig(ModelConfigBase): + """Config for naive forecaster (last value).""" + model_type: Literal["naive"] = "naive" + + +class SeasonalNaiveModelConfig(ModelConfigBase): + """Config for seasonal naive forecaster.""" + model_type: Literal["seasonal_naive"] = "seasonal_naive" + season_length: int = Field(default=7, ge=1, le=365, description="Seasonality period in days") + + +class MovingAverageModelConfig(ModelConfigBase): + """Config for moving average forecaster.""" + model_type: Literal["moving_average"] = "moving_average" + window_size: int = Field(default=7, ge=1, le=90, description="Window size for averaging") + + +class LightGBMModelConfig(ModelConfigBase): + """Config for LightGBM
regressor (feature-flagged).""" + model_type: Literal["lightgbm"] = "lightgbm" + n_estimators: int = Field(default=100, ge=10, le=1000) + max_depth: int = Field(default=6, ge=1, le=20) + learning_rate: float = Field(default=0.1, ge=0.001, le=1.0) + feature_config_hash: str | None = Field(default=None, description="Hash of FeatureSetConfig used") + + +# Union type for all configs +ModelConfig = NaiveModelConfig | SeasonalNaiveModelConfig | MovingAverageModelConfig | LightGBMModelConfig + + +class TrainRequest(BaseModel): + """Request body for POST /forecasting/train.""" + model_config = ConfigDict(strict=True) + + store_id: int = Field(..., ge=1) + product_id: int = Field(..., ge=1) + train_start_date: date + train_end_date: date + config: ModelConfig + + @field_validator("train_end_date") + @classmethod + def validate_date_range(cls, v: date, info) -> date: + if "train_start_date" in info.data and v <= info.data["train_start_date"]: + raise ValueError("train_end_date must be after train_start_date") + return v + + +class PredictRequest(BaseModel): + """Request body for POST /forecasting/predict.""" + model_config = ConfigDict(strict=True) + + store_id: int = Field(..., ge=1) + product_id: int = Field(..., ge=1) + horizon: int = Field(..., ge=1, le=90, description="Number of days to forecast") + model_path: str = Field(..., description="Path to saved model bundle") + + +class ForecastPoint(BaseModel): + """Single forecast point.""" + date: date + forecast: float + lower_bound: float | None = None + upper_bound: float | None = None + + +class PredictResponse(BaseModel): + """Response body for POST /forecasting/predict.""" + store_id: int + product_id: int + forecasts: list[ForecastPoint] + model_type: str + config_hash: str + horizon: int + duration_ms: float +``` + +### Model Interface (Abstract Base) + +```python +# app/features/forecasting/models.py + +from abc import ABC, abstractmethod +from dataclasses import dataclass +import numpy as np +import pandas as pd 
+from typing import Any + +@dataclass +class FitResult: + """Result of model fitting.""" + fitted: bool + n_observations: int + train_start: date + train_end: date + metrics: dict[str, float] # e.g., {"train_mae": 1.23} + + +class BaseForecaster(ABC): + """Abstract base class for all forecasting models. + + CRITICAL: All implementations must be deterministic with fixed random_state. + + Interface follows scikit-learn conventions: + - fit(y, X=None) -> self + - predict(horizon, X=None) -> np.ndarray + - get_params() -> dict + - set_params(**params) -> self + """ + + def __init__(self, random_state: int = 42): + self.random_state = random_state + self._is_fitted = False + self._last_values: np.ndarray | None = None + self._fit_result: FitResult | None = None + + @abstractmethod + def fit(self, y: np.ndarray, X: np.ndarray | None = None) -> "BaseForecaster": + """Fit the model on historical data. + + Args: + y: Target values (1D array of shape [n_samples]) + X: Optional exogenous features (2D array of shape [n_samples, n_features]) + + Returns: + self (for method chaining) + """ + pass + + @abstractmethod + def predict(self, horizon: int, X: np.ndarray | None = None) -> np.ndarray: + """Generate forecasts for the specified horizon. + + CRITICAL: For recursive forecasting, predictions at t+k become + inputs for predictions at t+k+1. + + Args: + horizon: Number of steps to forecast + X: Optional exogenous features for forecast period + + Returns: + Array of forecasts with shape [horizon] + """ + pass + + @abstractmethod + def get_params(self) -> dict[str, Any]: + """Get model parameters (scikit-learn convention).""" + pass + + @abstractmethod + def set_params(self, **params: Any) -> "BaseForecaster": + """Set model parameters (scikit-learn convention).""" + pass + + @property + def is_fitted(self) -> bool: + return self._is_fitted + + +class NaiveForecaster(BaseForecaster): + """Naive forecaster: predicts last observed value for all horizons. 
+ + Formula: y_hat[t+h] = y[t] for all h + """ + + def __init__(self, random_state: int = 42): + super().__init__(random_state) + + def fit(self, y: np.ndarray, X: np.ndarray | None = None) -> "NaiveForecaster": + if len(y) == 0: + raise ValueError("Cannot fit on empty array") + self._last_values = np.array([y[-1]]) + self._is_fitted = True + return self + + def predict(self, horizon: int, X: np.ndarray | None = None) -> np.ndarray: + if not self._is_fitted: + raise RuntimeError("Model must be fitted before predict") + # Repeat last value for all horizons + return np.full(horizon, self._last_values[0]) + + def get_params(self) -> dict[str, Any]: + return {"random_state": self.random_state} + + def set_params(self, **params: Any) -> "NaiveForecaster": + for key, value in params.items(): + setattr(self, key, value) + return self + + +class SeasonalNaiveForecaster(BaseForecaster): + """Seasonal naive forecaster: predicts value from same season in previous cycle. + + Formula: y_hat[t+h] = y[t+h-m] where m is season_length + + For weekly seasonality (m=7), Friday's forecast = last Friday's value. 
+ """ + + def __init__(self, season_length: int = 7, random_state: int = 42): + super().__init__(random_state) + self.season_length = season_length + + def fit(self, y: np.ndarray, X: np.ndarray | None = None) -> "SeasonalNaiveForecaster": + if len(y) < self.season_length: + raise ValueError(f"Need at least {self.season_length} observations") + # Store last season_length values for cycling + self._last_values = y[-self.season_length:] + self._is_fitted = True + return self + + def predict(self, horizon: int, X: np.ndarray | None = None) -> np.ndarray: + if not self._is_fitted: + raise RuntimeError("Model must be fitted before predict") + # Cycle through seasonal values + forecasts = np.zeros(horizon) + for h in range(horizon): + idx = h % self.season_length + forecasts[h] = self._last_values[idx] + return forecasts + + def get_params(self) -> dict[str, Any]: + return {"season_length": self.season_length, "random_state": self.random_state} + + def set_params(self, **params: Any) -> "SeasonalNaiveForecaster": + for key, value in params.items(): + setattr(self, key, value) + return self + + +class MovingAverageForecaster(BaseForecaster): + """Moving average forecaster: predicts mean of last N observations. + + Formula: y_hat[t+h] = mean(y[t-window+1:t+1]) + + CRITICAL: Does NOT update recursively - uses same average for all horizons. 
+ """ + + def __init__(self, window_size: int = 7, random_state: int = 42): + super().__init__(random_state) + self.window_size = window_size + + def fit(self, y: np.ndarray, X: np.ndarray | None = None) -> "MovingAverageForecaster": + if len(y) < self.window_size: + raise ValueError(f"Need at least {self.window_size} observations") + # Compute mean of last window_size values + self._last_values = y[-self.window_size:] + self._forecast_value = float(np.mean(self._last_values)) + self._is_fitted = True + return self + + def predict(self, horizon: int, X: np.ndarray | None = None) -> np.ndarray: + if not self._is_fitted: + raise RuntimeError("Model must be fitted before predict") + # Constant forecast for all horizons + return np.full(horizon, self._forecast_value) + + def get_params(self) -> dict[str, Any]: + return {"window_size": self.window_size, "random_state": self.random_state} + + def set_params(self, **params: Any) -> "MovingAverageForecaster": + for key, value in params.items(): + setattr(self, key, value) + return self +``` + +### Persistence Layer + +```python +# app/features/forecasting/persistence.py + +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any +import hashlib +import json + +import joblib + +from app.features.forecasting.models import BaseForecaster +from app.features.forecasting.schemas import ModelConfig + + +@dataclass +class ModelBundle: + """Bundle containing model, config, and metadata for persistence. + + CRITICAL: Includes version info for compatibility checking. 
+ """ + model: BaseForecaster + config: ModelConfig + metadata: dict[str, Any] = field(default_factory=dict) + + # Auto-populated on save + created_at: datetime | None = None + python_version: str | None = None + sklearn_version: str | None = None + bundle_hash: str | None = None + + def compute_hash(self) -> str: + """Compute deterministic hash of bundle contents.""" + content = { + "config_hash": self.config.config_hash(), + "model_params": self.model.get_params(), + "metadata": self.metadata, + } + return hashlib.sha256(json.dumps(content, sort_keys=True, default=str).encode()).hexdigest()[:16] + + +def save_model_bundle(bundle: ModelBundle, path: str | Path) -> Path: + """Save model bundle to disk using joblib. + + CRITICAL: Records Python and sklearn versions for compatibility warnings. + + Args: + bundle: ModelBundle to save + path: File path (will add .joblib extension if missing) + + Returns: + Path to saved file + """ + import sys + import sklearn + + path = Path(path) + if not path.suffix: + path = path.with_suffix(".joblib") + + # Ensure directory exists + path.parent.mkdir(parents=True, exist_ok=True) + + # Populate metadata + bundle.created_at = datetime.utcnow() + bundle.python_version = sys.version + bundle.sklearn_version = sklearn.__version__ + bundle.bundle_hash = bundle.compute_hash() + + # Save with compression + joblib.dump(bundle, path, compress=3) + + return path + + +def load_model_bundle(path: str | Path) -> ModelBundle: + """Load model bundle from disk. + + CRITICAL: Logs warning if versions don't match. 
+ + Args: + path: Path to saved bundle + + Returns: + Loaded ModelBundle + + Raises: + FileNotFoundError: If path doesn't exist + """ + import sys + import sklearn + import structlog + + logger = structlog.get_logger() + path = Path(path) + + if not path.exists(): + raise FileNotFoundError(f"Model bundle not found: {path}") + + bundle: ModelBundle = joblib.load(path) + + # Version compatibility warnings + if bundle.python_version and not sys.version.startswith(bundle.python_version.split()[0]): + logger.warning( + "forecasting.model_version_mismatch", + saved_python=bundle.python_version, + current_python=sys.version, + ) + + if bundle.sklearn_version and bundle.sklearn_version != sklearn.__version__: + logger.warning( + "forecasting.sklearn_version_mismatch", + saved_sklearn=bundle.sklearn_version, + current_sklearn=sklearn.__version__, + ) + + return bundle +``` + +--- + +## Task List + +### Task 1: Add forecasting settings to config + +```yaml +FILE: app/core/config.py +ACTION: MODIFY +FIND: "feature_max_window: int = 90" +INJECT AFTER: + - forecast_random_seed: int = 42 + - forecast_default_horizon: int = 14 + - forecast_max_horizon: int = 90 + - forecast_model_artifacts_dir: str = "./artifacts/models" + - forecast_enable_lightgbm: bool = False # Feature flag +VALIDATION: + - uv run mypy app/core/config.py + - uv run pyright app/core/config.py +``` + +### Task 2: Create forecasting module structure + +```yaml +ACTION: CREATE directories and __init__.py +FILES: + - app/features/forecasting/__init__.py + - app/features/forecasting/tests/__init__.py +PATTERN: Mirror featuresets module exports +``` + +### Task 3: Implement schemas.py + +```yaml +FILE: app/features/forecasting/schemas.py +ACTION: CREATE +IMPLEMENT: + - ModelConfigBase with frozen=True, config_hash() + - NaiveModelConfig, SeasonalNaiveModelConfig, MovingAverageModelConfig + - LightGBMModelConfig (for feature-flagged ML) + - TrainRequest, PredictRequest schemas + - ForecastPoint, PredictResponse 
schemas + - TrainResponse schema +PATTERN: Mirror app/features/featuresets/schemas.py +VALIDATION: + - uv run mypy app/features/forecasting/schemas.py + - uv run pyright app/features/forecasting/schemas.py +``` + +### Task 4: Implement models.py (BaseForecaster + baselines) + +```yaml +FILE: app/features/forecasting/models.py +ACTION: CREATE +IMPLEMENT: + - FitResult dataclass + - BaseForecaster ABC with fit/predict/get_params/set_params + - NaiveForecaster implementation + - SeasonalNaiveForecaster implementation + - MovingAverageForecaster implementation + - model_factory() function to instantiate by type +CRITICAL: + - All models must be deterministic with random_state + - Store last N values for prediction + - Raise RuntimeError if predict called before fit +VALIDATION: + - uv run mypy app/features/forecasting/models.py + - uv run pyright app/features/forecasting/models.py +``` + +### Task 5: Implement persistence.py + +```yaml +FILE: app/features/forecasting/persistence.py +ACTION: CREATE +IMPLEMENT: + - ModelBundle dataclass + - save_model_bundle() with joblib compression + - load_model_bundle() with version warnings +CRITICAL: + - Record Python/sklearn versions + - Compute deterministic bundle hash + - Log warnings on version mismatch +VALIDATION: + - uv run mypy app/features/forecasting/persistence.py + - uv run pyright app/features/forecasting/persistence.py +``` + +### Task 6: Implement service.py + +```yaml +FILE: app/features/forecasting/service.py +ACTION: CREATE +IMPLEMENT: + - ForecastingService class + - train_model() method: load data, fit model, save bundle + - predict() method: load bundle, generate forecasts + - _load_training_data() helper: query SalesDaily + - _prepare_features() helper: call FeatureEngineeringService if ML model +CRITICAL: + - Use cutoff_date = train_end_date for feature computation + - Validate grain (store_id, product_id) + - Log forecasting.train_started, forecasting.train_completed +VALIDATION: + - uv run mypy 
app/features/forecasting/service.py + - uv run pyright app/features/forecasting/service.py +``` + +### Task 7: Implement routes.py + +```yaml +FILE: app/features/forecasting/routes.py +ACTION: CREATE +IMPLEMENT: + - APIRouter(prefix="/forecasting", tags=["forecasting"]) + - POST /train endpoint + - POST /predict endpoint + - Error handling with ForecastLabError +PATTERN: Mirror app/features/featuresets/routes.py +CRITICAL: + - time.perf_counter() for duration_ms + - Depends(get_db) for database session + - Structured logging on entry/exit/error +VALIDATION: + - uv run mypy app/features/forecasting/routes.py + - uv run pyright app/features/forecasting/routes.py +``` + +### Task 8: Register router in main.py + +```yaml +FILE: app/main.py +ACTION: MODIFY +FIND: "app.include_router(featuresets_router)" +INJECT AFTER: + - from app.features.forecasting.routes import router as forecasting_router + - app.include_router(forecasting_router) +VALIDATION: + - uv run python -c "from app.main import app; print('OK')" +``` + +### Task 9: Create test fixtures (conftest.py) + +```yaml +FILE: app/features/forecasting/tests/conftest.py +ACTION: CREATE +IMPLEMENT: + - sample_time_series: 60 days of sequential values + - sample_seasonal_series: data with weekly pattern + - sample_naive_config, sample_seasonal_config, sample_mavg_config + - tmp_model_path: temporary path for serialization tests +PATTERN: Mirror app/features/featuresets/tests/conftest.py +``` + +### Task 10: Create test_schemas.py + +```yaml +FILE: app/features/forecasting/tests/test_schemas.py +ACTION: CREATE +IMPLEMENT: + - Test config validation (positive values, ranges) + - Test config immutability (frozen=True) + - Test config_hash() determinism + - Test TrainRequest date validation + - Test PredictRequest horizon validation +VALIDATION: + - uv run pytest app/features/forecasting/tests/test_schemas.py -v +``` + +### Task 11: Create test_models.py + +```yaml +FILE: app/features/forecasting/tests/test_models.py 
+ACTION: CREATE +IMPLEMENT: + - TestNaiveForecaster: fit, predict, determinism + - TestSeasonalNaiveForecaster: fit, predict, seasonal cycling + - TestMovingAverageForecaster: fit, predict, window averaging + - Test error on predict before fit + - Test error on insufficient data + - Test get_params/set_params +CRITICAL: + - Use sequential data for determinism verification + - Assert exact expected values +VALIDATION: + - uv run pytest app/features/forecasting/tests/test_models.py -v +``` + +### Task 12: Create test_persistence.py + +```yaml +FILE: app/features/forecasting/tests/test_persistence.py +ACTION: CREATE +IMPLEMENT: + - Test save/load round-trip + - Test bundle_hash consistency + - Test version metadata recorded + - Test FileNotFoundError on missing path + - Test compression reduces file size +VALIDATION: + - uv run pytest app/features/forecasting/tests/test_persistence.py -v +``` + +### Task 13: Create test_service.py + +```yaml +FILE: app/features/forecasting/tests/test_service.py +ACTION: CREATE +IMPLEMENT: + - Test train_model happy path (mock DB) + - Test predict happy path (mock loaded bundle) + - Test error handling for missing data + - Test model_factory returns correct type +VALIDATION: + - uv run pytest app/features/forecasting/tests/test_service.py -v +``` + +### Task 14: Create example files + +```yaml +FILES: + - examples/models/baseline_naive.py + - examples/models/baseline_seasonal.py + - examples/models/baseline_mavg.py + - examples/models/model_interface.md +ACTION: CREATE +IMPLEMENT: + - Runnable demos showing train -> save -> load -> predict + - Documentation of model interface contract +``` + +### Task 15: Update module __init__.py exports + +```yaml +FILE: app/features/forecasting/__init__.py +ACTION: MODIFY +IMPLEMENT: + - Export all public classes + - __all__ list (sorted alphabetically) +VALIDATION: + - uv run python -c "from app.features.forecasting import *; print('OK')" +``` + +--- + +## Validation Loop + +### Level 1: Syntax & 
Style + +```bash +# Run after EACH file creation +uv run ruff check app/features/forecasting/ --fix +uv run ruff format app/features/forecasting/ + +# Expected: All checks passed! +``` + +### Level 2: Type Checking + +```bash +# Run after completing schemas, models, persistence, service +uv run mypy app/features/forecasting/ +uv run pyright app/features/forecasting/ + +# Expected: Success: no issues found +``` + +### Level 3: Unit Tests + +```bash +# Run incrementally as tests are created +uv run pytest app/features/forecasting/tests/test_schemas.py -v +uv run pytest app/features/forecasting/tests/test_models.py -v +uv run pytest app/features/forecasting/tests/test_persistence.py -v +uv run pytest app/features/forecasting/tests/test_service.py -v + +# Run all +uv run pytest app/features/forecasting/tests/ -v + +# Expected: 40+ tests passed +``` + +### Level 4: Integration Test + +```bash +# Start API +uv run uvicorn app.main:app --reload --port 8123 + +# Test train endpoint (requires seeded DB) +curl -X POST http://localhost:8123/forecasting/train \ + -H "Content-Type: application/json" \ + -d '{ + "store_id": 1, + "product_id": 1, + "train_start_date": "2024-01-01", + "train_end_date": "2024-01-31", + "config": {"model_type": "naive"} + }' + +# Test predict endpoint +curl -X POST http://localhost:8123/forecasting/predict \ + -H "Content-Type: application/json" \ + -d '{ + "store_id": 1, + "product_id": 1, + "horizon": 7, + "model_path": "./artifacts/models/model_xxx.joblib" + }' +``` + +### Level 5: Full Validation + +```bash +# Complete validation suite +uv run ruff check app/features/forecasting/ && \ +uv run mypy app/features/forecasting/ && \ +uv run pyright app/features/forecasting/ && \ +uv run pytest app/features/forecasting/tests/ -v + +# Expected: All green +``` + +--- + +## Final Checklist + +- [ ] All 15 tasks completed +- [ ] `uv run ruff check .` — no errors +- [ ] `uv run mypy app/features/forecasting/` — no errors +- [ ] `uv run pyright 
app/features/forecasting/` — no errors +- [ ] `uv run pytest app/features/forecasting/tests/ -v` — 40+ tests passed +- [ ] Example scripts run successfully +- [ ] Router registered in main.py +- [ ] Settings added to config.py +- [ ] Logging events follow standard format + +--- + +## Anti-Patterns to Avoid + +- **DON'T** hardcode horizons or window sizes — use config +- **DON'T** use random operations without `random_state` +- **DON'T** call predict() before fit() — raise RuntimeError +- **DON'T** use pickle directly — use joblib for sklearn compatibility +- **DON'T** ignore version mismatches — log warnings +- **DON'T** store entire training data in model — only store what's needed for prediction +- **DON'T** use future data in feature computation — enforce cutoff_date +- **DON'T** catch generic Exception — be specific + +--- + +## Confidence Score: 8/10 + +**Strengths:** +- Clear patterns from featuresets module to follow +- Well-documented scikit-learn interface standards +- Comprehensive task breakdown +- Executable validation gates + +**Risks:** +- LightGBM integration deferred (feature-flagged) +- Integration tests require seeded database +- Recursive ML forecasting complexity (deferred to backtesting phase) + +**Mitigation:** +- Focus on baseline models first (naive, seasonal, moving average) +- LightGBM is optional and feature-flagged +- Recursive forecasting for ML models will be addressed in INITIAL-6 + +--- + +## Sources + +- [Developing scikit-learn estimators](https://scikit-learn.org/stable/developers/develop.html) +- [scikit-learn Model Persistence](https://scikit-learn.org/stable/model_persistence.html) +- [scikit-learn Pipeline Composition](https://scikit-learn.org/stable/modules/compose.html) +- [Skforecast Recursive Multi-step Forecasting](https://skforecast.org/0.9.1/user_guides/autoregresive-forecaster) +- [Naive Time Series Forecasting in Python](https://forecastegy.com/posts/naive-time-series-forecasting-in-python/) +- [Multi-Step Time Series 
Forecasting Strategies](https://machinelearningmastery.com/multi-step-time-series-forecasting/) +- [Seasonal Persistence Forecasting](https://machinelearningmastery.com/seasonal-persistence-forecasting-python/) diff --git a/docs/DAILY-FLOW.md b/docs/DAILY-FLOW.md index 3ceefe42..66521dbc 100644 --- a/docs/DAILY-FLOW.md +++ b/docs/DAILY-FLOW.md @@ -162,14 +162,21 @@ gh run watch --- -## Következő Phase: Ingest Layer (PRP-3) +## Következő Phase: Forecasting (PRP-5) ```bash # Kezdés git checkout dev git pull origin dev -git checkout -b feat/prp-3-ingest-layer +git checkout -b feat/prp-5-forecasting # Fejlesztés... -# PR → dev → main → release → phase-2 snapshot +# PR → dev → main → release → phase-4 snapshot ``` + +### PRP-5 Scope (INITIAL-5) +- Model zoo: naive, seasonal naive, moving average +- Unified model interface: fit/predict, serialize/load +- Scikit-learn Pipeline: Scaling → Encoding → Regressor +- Joblib-based ModelBundle persistence +- Multi-horizon recursive forecasting From a9a055f39cb781dbb5b6f8f9b76e7d4e833d30ce Mon Sep 17 00:00:00 2001 From: Gabor Szabo <168316277+w7-mgfcode@users.noreply.github.com> Date: Sun, 1 Feb 2026 02:32:33 +0100 Subject: [PATCH 4/5] feat(forecasting): implement baseline model zoo and unified interface (#28) * feat(forecasting): implement baseline model zoo and unified interface Add forecasting module (PRP-5) with: - BaseForecaster ABC with scikit-learn-style interface (fit/predict) - NaiveForecaster, SeasonalNaiveForecaster, MovingAverageForecaster - ModelBundle persistence with joblib serialization - POST /forecasting/train and /forecasting/predict endpoints - ForecastingService for orchestration - 81 unit tests covering schemas, models, persistence, and service - Example scripts demonstrating each baseline model - LightGBM placeholder (feature-flagged, not yet implemented) Co-Authored-By: Claude Opus 4.5 * docs: Update documentation for forecasting module (PRP-5) - Add forecasting API endpoints to README.md with examples - 
Update ARCHITECTURE.md with forecasting implementation details - Add scikit-learn and joblib to dependencies list - Add forecasting config variables to .env.example - Mark forecasting module as IMPLEMENTED in architecture docs Co-Authored-By: Claude Opus 4.5 * fix: address CI lint and type check failures - Add type: ignore for intentional type mismatch in frozen config test - Add S101 ignore for examples/ to allow assert statements Co-Authored-By: Claude Opus 4.5 --------- Co-authored-by: Gabe@w7dev Co-authored-by: Claude Opus 4.5 --- .env.example | 7 + README.md | 45 +- app/core/config.py | 7 + app/features/forecasting/__init__.py | 82 ++++ app/features/forecasting/models.py | 460 ++++++++++++++++++ app/features/forecasting/persistence.py | 160 ++++++ app/features/forecasting/routes.py | 224 +++++++++ app/features/forecasting/schemas.py | 281 +++++++++++ app/features/forecasting/service.py | 334 +++++++++++++ app/features/forecasting/tests/__init__.py | 1 + app/features/forecasting/tests/conftest.py | 83 ++++ app/features/forecasting/tests/test_models.py | 268 ++++++++++ .../forecasting/tests/test_persistence.py | 217 +++++++++ .../forecasting/tests/test_schemas.py | 278 +++++++++++ .../forecasting/tests/test_service.py | 267 ++++++++++ app/main.py | 2 + docs/ARCHITECTURE.md | 66 ++- examples/models/baseline_mavg.py | 89 ++++ examples/models/baseline_naive.py | 64 +++ examples/models/baseline_seasonal.py | 81 +++ examples/models/model_interface.md | 215 ++++++++ pyproject.toml | 4 +- uv.lock | 129 ++++- 23 files changed, 3348 insertions(+), 16 deletions(-) create mode 100644 app/features/forecasting/__init__.py create mode 100644 app/features/forecasting/models.py create mode 100644 app/features/forecasting/persistence.py create mode 100644 app/features/forecasting/routes.py create mode 100644 app/features/forecasting/schemas.py create mode 100644 app/features/forecasting/service.py create mode 100644 app/features/forecasting/tests/__init__.py create mode 100644 
app/features/forecasting/tests/conftest.py create mode 100644 app/features/forecasting/tests/test_models.py create mode 100644 app/features/forecasting/tests/test_persistence.py create mode 100644 app/features/forecasting/tests/test_schemas.py create mode 100644 app/features/forecasting/tests/test_service.py create mode 100644 examples/models/baseline_mavg.py create mode 100644 examples/models/baseline_naive.py create mode 100644 examples/models/baseline_seasonal.py create mode 100644 examples/models/model_interface.md diff --git a/.env.example b/.env.example index 9a197f84..d21b33f8 100644 --- a/.env.example +++ b/.env.example @@ -15,5 +15,12 @@ LOG_FORMAT=json API_HOST=0.0.0.0 API_PORT=8123 +# Forecasting settings +FORECAST_RANDOM_SEED=42 +FORECAST_DEFAULT_HORIZON=14 +FORECAST_MAX_HORIZON=90 +FORECAST_MODEL_ARTIFACTS_DIR=./artifacts/models +FORECAST_ENABLE_LIGHTGBM=false + # Frontend (Vite) VITE_API_BASE_URL=http://localhost:8123 diff --git a/README.md b/README.md index 0d5607d9..cc1dd062 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,8 @@ app/ ├── features/ │ ├── data_platform/ # Store, product, calendar, sales tables │ ├── ingest/ # Batch upsert endpoints for sales data -│ └── featuresets/ # Time-safe feature engineering (lags, rolling, calendar) +│ ├── featuresets/ # Time-safe feature engineering (lags, rolling, calendar) +│ └── forecasting/ # Model training, prediction, persistence └── main.py # FastAPI entry point tests/ # Test fixtures and helpers @@ -103,6 +104,7 @@ examples/ ├── api/ # HTTP client examples ├── schema/ # Table documentation ├── queries/ # Example SQL queries +├── models/ # Baseline model examples (naive, seasonal_naive, moving_average) └── compute_features_demo.py # Feature engineering demo scripts/ # Utility scripts ``` @@ -184,6 +186,47 @@ curl -X POST http://localhost:8123/featuresets/compute \ See [examples/compute_features_demo.py](examples/compute_features_demo.py) for a complete demo. 
+### Forecasting + +- `POST /forecasting/train` - Train a forecasting model for a store/product series +- `POST /forecasting/predict` - Generate forecasts using a trained model + +**Example Training Request:** +```bash +curl -X POST http://localhost:8123/forecasting/train \ + -H "Content-Type: application/json" \ + -d '{ + "store_id": 1, + "product_id": 1, + "train_start_date": "2024-01-01", + "train_end_date": "2024-06-30", + "config": { + "model_type": "seasonal_naive", + "season_length": 7 + } + }' +``` + +**Example Prediction Request:** +```bash +curl -X POST http://localhost:8123/forecasting/predict \ + -H "Content-Type: application/json" \ + -d '{ + "store_id": 1, + "product_id": 1, + "horizon": 14, + "model_path": "./artifacts/models/store_1_product_1_seasonal_naive_20240630.joblib" + }' +``` + +**Supported Model Types:** +- `naive` - Last observed value (simple baseline) +- `seasonal_naive` - Same period from previous season +- `moving_average` - Mean of last N observations +- `lightgbm` - LightGBM regressor (requires `forecast_enable_lightgbm=True`) + +See [examples/models/](examples/models/) for baseline model examples. 
+ ## API Documentation Once the server is running: diff --git a/app/core/config.py b/app/core/config.py index a32ddcc2..d3635014 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -40,6 +40,13 @@ class Settings(BaseSettings): feature_max_lag: int = 365 feature_max_window: int = 90 + # Forecasting + forecast_random_seed: int = 42 + forecast_default_horizon: int = 14 + forecast_max_horizon: int = 90 + forecast_model_artifacts_dir: str = "./artifacts/models" + forecast_enable_lightgbm: bool = False + @property def is_development(self) -> bool: """Check if running in development mode.""" diff --git a/app/features/forecasting/__init__.py b/app/features/forecasting/__init__.py new file mode 100644 index 00000000..85fd8d22 --- /dev/null +++ b/app/features/forecasting/__init__.py @@ -0,0 +1,82 @@ +"""Forecasting module for baseline and ML models. + +This module provides a unified interface for training and predicting with +various forecasting models including naive, seasonal naive, and moving average. 
+ +Exports: + Models: + - BaseForecaster: Abstract base class for all forecasters + - NaiveForecaster: Predicts last observed value + - SeasonalNaiveForecaster: Predicts value from same season + - MovingAverageForecaster: Predicts mean of last N observations + - model_factory: Create forecaster from config + + Schemas: + - ModelConfig: Union of all model configurations + - NaiveModelConfig, SeasonalNaiveModelConfig, MovingAverageModelConfig + - TrainRequest, TrainResponse + - PredictRequest, PredictResponse, ForecastPoint + + Persistence: + - ModelBundle: Container for model + config + metadata + - save_model_bundle, load_model_bundle + + Service: + - ForecastingService: Orchestration layer for training/prediction +""" + +from app.features.forecasting.models import ( + BaseForecaster, + FitResult, + MovingAverageForecaster, + NaiveForecaster, + SeasonalNaiveForecaster, + model_factory, +) +from app.features.forecasting.persistence import ( + ModelBundle, + load_model_bundle, + save_model_bundle, +) +from app.features.forecasting.schemas import ( + ForecastPoint, + LightGBMModelConfig, + ModelConfig, + ModelConfigBase, + MovingAverageModelConfig, + NaiveModelConfig, + PredictRequest, + PredictResponse, + SeasonalNaiveModelConfig, + TrainRequest, + TrainResponse, +) +from app.features.forecasting.service import ForecastingService + +__all__ = [ + # Models + "BaseForecaster", + "FitResult", + # Schemas + "ForecastPoint", + # Service + "ForecastingService", + "LightGBMModelConfig", + "ModelBundle", + "ModelConfig", + "ModelConfigBase", + "MovingAverageForecaster", + "MovingAverageModelConfig", + "NaiveForecaster", + "NaiveModelConfig", + "PredictRequest", + "PredictResponse", + "SeasonalNaiveForecaster", + "SeasonalNaiveModelConfig", + "TrainRequest", + "TrainResponse", + # Persistence + "load_model_bundle", + "model_factory", + "save_model_bundle", +] diff --git a/app/features/forecasting/models.py b/app/features/forecasting/models.py new file mode 100644 index 
00000000..0727642b --- /dev/null +++ b/app/features/forecasting/models.py @@ -0,0 +1,460 @@ +"""Forecasting models with unified scikit-learn-style interface. + +All forecasters implement a common interface: +- fit(y, X=None) -> self +- predict(horizon, X=None) -> np.ndarray +- get_params() -> dict +- set_params(**params) -> self + +CRITICAL: All implementations must be deterministic with fixed random_state. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import date as date_type +from typing import TYPE_CHECKING, Any, Literal + +import numpy as np + +if TYPE_CHECKING: + from app.features.forecasting.schemas import ModelConfig + + +@dataclass +class FitResult: + """Result of model fitting. + + Attributes: + fitted: Whether the model was successfully fitted. + n_observations: Number of observations used for fitting. + train_start: Start date of training period. + train_end: End date of training period. + metrics: Dictionary of training metrics (e.g., {"train_mae": 1.23}). + """ + + fitted: bool + n_observations: int + train_start: date_type + train_end: date_type + metrics: dict[str, float] = field(default_factory=lambda: {}) + + +class BaseForecaster(ABC): + """Abstract base class for all forecasting models. + + CRITICAL: All implementations must be deterministic with fixed random_state. + + Interface follows scikit-learn conventions: + - fit(y, X=None) -> self + - predict(horizon, X=None) -> np.ndarray + - get_params() -> dict + - set_params(**params) -> self + + Attributes: + random_state: Random seed for reproducibility. + """ + + def __init__(self, random_state: int = 42) -> None: + """Initialize the forecaster. + + Args: + random_state: Random seed for reproducibility. 
+ """ + self.random_state = random_state + self._is_fitted = False + self._last_values: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None + self._fit_result: FitResult | None = None + + @abstractmethod + def fit( + self, + y: np.ndarray[Any, np.dtype[np.floating[Any]]], + X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None, + ) -> BaseForecaster: + """Fit the model on historical data. + + Args: + y: Target values (1D array of shape [n_samples]). + X: Optional exogenous features (2D array of shape [n_samples, n_features]). + + Returns: + self (for method chaining). + + Raises: + ValueError: If y is empty or has insufficient observations. + """ + + @abstractmethod + def predict( + self, horizon: int, X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None + ) -> np.ndarray[Any, np.dtype[np.floating[Any]]]: + """Generate forecasts for the specified horizon. + + CRITICAL: For recursive forecasting, predictions at t+k become + inputs for predictions at t+k+1. + + Args: + horizon: Number of steps to forecast. + X: Optional exogenous features for forecast period. + + Returns: + Array of forecasts with shape [horizon]. + + Raises: + RuntimeError: If model has not been fitted. + """ + + @abstractmethod + def get_params(self) -> dict[str, Any]: + """Get model parameters (scikit-learn convention). + + Returns: + Dictionary of parameter names to values. + """ + + @abstractmethod + def set_params(self, **params: Any) -> BaseForecaster: # noqa: ANN401 + """Set model parameters (scikit-learn convention). + + Args: + **params: Parameter names and values to set. + + Returns: + self (for method chaining). + """ + + @property + def is_fitted(self) -> bool: + """Check if the model has been fitted. + + Returns: + True if fit() has been called successfully. + """ + return self._is_fitted + + +class NaiveForecaster(BaseForecaster): + """Naive forecaster: predicts last observed value for all horizons. 
+ + Formula: y_hat[t+h] = y[t] for all h + + This is the simplest baseline model. It assumes the time series will + remain constant at its last observed value. + """ + + def __init__(self, random_state: int = 42) -> None: + """Initialize the naive forecaster. + + Args: + random_state: Random seed for reproducibility (unused but kept for interface). + """ + super().__init__(random_state) + self._last_value: float = 0.0 + + def fit( + self, + y: np.ndarray[Any, np.dtype[np.floating[Any]]], + X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None, # noqa: ARG002 + ) -> NaiveForecaster: + """Fit by storing the last observed value. + + Args: + y: Target values (1D array). + X: Ignored for naive model. + + Returns: + self (for method chaining). + + Raises: + ValueError: If y is empty. + """ + if len(y) == 0: + raise ValueError("Cannot fit on empty array") + self._last_value = float(y[-1]) + self._last_values = np.array([self._last_value]) + self._is_fitted = True + return self + + def predict( + self, + horizon: int, + X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None, # noqa: ARG002 + ) -> np.ndarray[Any, np.dtype[np.floating[Any]]]: + """Predict last value for all horizons. + + Args: + horizon: Number of steps to forecast. + X: Ignored for naive model. + + Returns: + Array of forecasts with shape [horizon]. + + Raises: + RuntimeError: If model has not been fitted. + """ + if not self._is_fitted: + raise RuntimeError("Model must be fitted before predict") + return np.full(horizon, self._last_value, dtype=np.float64) + + def get_params(self) -> dict[str, Any]: + """Get model parameters. + + Returns: + Dictionary with random_state. + """ + return {"random_state": self.random_state} + + def set_params(self, **params: Any) -> NaiveForecaster: # noqa: ANN401 + """Set model parameters. + + Args: + **params: Parameter names and values to set. + + Returns: + self (for method chaining). 
+ """ + for key, value in params.items(): + setattr(self, key, value) + return self + + +class SeasonalNaiveForecaster(BaseForecaster): + """Seasonal naive forecaster: predicts value from same season in previous cycle. + + Formula: y_hat[t+h] = y[t+h-m] where m is season_length + + For weekly seasonality (m=7), Friday's forecast = last Friday's value. + + Attributes: + season_length: Seasonality period in days (default: 7 for weekly). + """ + + def __init__(self, season_length: int = 7, random_state: int = 42) -> None: + """Initialize the seasonal naive forecaster. + + Args: + season_length: Seasonality period in days. + random_state: Random seed for reproducibility (unused but kept for interface). + """ + super().__init__(random_state) + self.season_length = season_length + + def fit( + self, + y: np.ndarray[Any, np.dtype[np.floating[Any]]], + X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None, # noqa: ARG002 + ) -> SeasonalNaiveForecaster: + """Fit by storing the last season_length values. + + Args: + y: Target values (1D array). + X: Ignored for seasonal naive model. + + Returns: + self (for method chaining). + + Raises: + ValueError: If y has fewer observations than season_length. + """ + if len(y) < self.season_length: + raise ValueError(f"Need at least {self.season_length} observations") + # Store last season_length values for cycling + self._last_values = np.array(y[-self.season_length :], dtype=np.float64) + self._is_fitted = True + return self + + def predict( + self, + horizon: int, + X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None, # noqa: ARG002 + ) -> np.ndarray[Any, np.dtype[np.floating[Any]]]: + """Predict by cycling through seasonal values. + + Args: + horizon: Number of steps to forecast. + X: Ignored for seasonal naive model. + + Returns: + Array of forecasts with shape [horizon]. + + Raises: + RuntimeError: If model has not been fitted. 
+ """ + if not self._is_fitted: + raise RuntimeError("Model must be fitted before predict") + if self._last_values is None: + raise RuntimeError("Model was not properly fitted") + # Cycle through seasonal values + forecasts = np.zeros(horizon, dtype=np.float64) + for h in range(horizon): + idx = h % self.season_length + forecasts[h] = self._last_values[idx] + return forecasts + + def get_params(self) -> dict[str, Any]: + """Get model parameters. + + Returns: + Dictionary with season_length and random_state. + """ + return {"season_length": self.season_length, "random_state": self.random_state} + + def set_params(self, **params: Any) -> SeasonalNaiveForecaster: # noqa: ANN401 + """Set model parameters. + + Args: + **params: Parameter names and values to set. + + Returns: + self (for method chaining). + """ + for key, value in params.items(): + setattr(self, key, value) + return self + + +class MovingAverageForecaster(BaseForecaster): + """Moving average forecaster: predicts mean of last N observations. + + Formula: y_hat[t+h] = mean(y[t-window+1:t+1]) + + CRITICAL: Does NOT update recursively - uses same average for all horizons. + + Attributes: + window_size: Window size for averaging (default: 7). + """ + + def __init__(self, window_size: int = 7, random_state: int = 42) -> None: + """Initialize the moving average forecaster. + + Args: + window_size: Window size for averaging. + random_state: Random seed for reproducibility (unused but kept for interface). + """ + super().__init__(random_state) + self.window_size = window_size + self._forecast_value: float = 0.0 + + def fit( + self, + y: np.ndarray[Any, np.dtype[np.floating[Any]]], + X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None, # noqa: ARG002 + ) -> MovingAverageForecaster: + """Fit by computing mean of last window_size values. + + Args: + y: Target values (1D array). + X: Ignored for moving average model. + + Returns: + self (for method chaining). 
+ + Raises: + ValueError: If y has fewer observations than window_size. + """ + if len(y) < self.window_size: + raise ValueError(f"Need at least {self.window_size} observations") + # Compute mean of last window_size values + self._last_values = np.array(y[-self.window_size :], dtype=np.float64) + self._forecast_value = float(np.mean(self._last_values)) + self._is_fitted = True + return self + + def predict( + self, + horizon: int, + X: np.ndarray[Any, np.dtype[np.floating[Any]]] | None = None, # noqa: ARG002 + ) -> np.ndarray[Any, np.dtype[np.floating[Any]]]: + """Predict constant value (mean) for all horizons. + + Args: + horizon: Number of steps to forecast. + X: Ignored for moving average model. + + Returns: + Array of forecasts with shape [horizon]. + + Raises: + RuntimeError: If model has not been fitted. + """ + if not self._is_fitted: + raise RuntimeError("Model must be fitted before predict") + # Constant forecast for all horizons + return np.full(horizon, self._forecast_value, dtype=np.float64) + + def get_params(self) -> dict[str, Any]: + """Get model parameters. + + Returns: + Dictionary with window_size and random_state. + """ + return {"window_size": self.window_size, "random_state": self.random_state} + + def set_params(self, **params: Any) -> MovingAverageForecaster: # noqa: ANN401 + """Set model parameters. + + Args: + **params: Parameter names and values to set. + + Returns: + self (for method chaining). + """ + for key, value in params.items(): + setattr(self, key, value) + return self + + +# Type alias for model type literals +ModelType = Literal["naive", "seasonal_naive", "moving_average", "lightgbm"] + + +def model_factory(config: ModelConfig, random_state: int = 42) -> BaseForecaster: + """Create a forecaster instance from a configuration. + + Args: + config: Model configuration. + random_state: Random seed for reproducibility. + + Returns: + Instantiated forecaster. 
+ + Raises: + ValueError: If model_type is unknown or LightGBM is not enabled. + """ + from app.core.config import get_settings + + settings = get_settings() + + model_type: str = config.model_type + + if model_type == "naive": + return NaiveForecaster(random_state=random_state) + elif model_type == "seasonal_naive": + from app.features.forecasting.schemas import SeasonalNaiveModelConfig + + if isinstance(config, SeasonalNaiveModelConfig): + return SeasonalNaiveForecaster( + season_length=config.season_length, + random_state=random_state, + ) + raise ValueError("Invalid config type for seasonal_naive") + elif model_type == "moving_average": + from app.features.forecasting.schemas import MovingAverageModelConfig + + if isinstance(config, MovingAverageModelConfig): + return MovingAverageForecaster( + window_size=config.window_size, + random_state=random_state, + ) + raise ValueError("Invalid config type for moving_average") + elif model_type == "lightgbm": + if not settings.forecast_enable_lightgbm: + raise ValueError( + "LightGBM is not enabled. Set forecast_enable_lightgbm=True in settings." + ) + # LightGBM implementation would go here when feature-flagged + raise NotImplementedError("LightGBM forecaster not yet implemented") + else: + raise ValueError(f"Unknown model type: {model_type}") diff --git a/app/features/forecasting/persistence.py b/app/features/forecasting/persistence.py new file mode 100644 index 00000000..d0a435c7 --- /dev/null +++ b/app/features/forecasting/persistence.py @@ -0,0 +1,160 @@ +"""Model persistence layer using joblib serialization. + +Provides ModelBundle container for storing model + config + metadata, +and save/load functions with version compatibility warnings. + +CRITICAL: Models saved with one Python/sklearn version may not load in another. 
+""" + +from __future__ import annotations + +import hashlib +import json +import sys +from dataclasses import dataclass, field +from datetime import UTC, datetime +from pathlib import Path +from typing import TYPE_CHECKING + +import joblib # type: ignore[import-untyped] +import sklearn # type: ignore[import-untyped] +import structlog + +if TYPE_CHECKING: + from app.features.forecasting.models import BaseForecaster + from app.features.forecasting.schemas import ModelConfig + +logger = structlog.get_logger() + + +@dataclass +class ModelBundle: + """Bundle containing model, config, and metadata for persistence. + + CRITICAL: Includes version info for compatibility checking. + + Attributes: + model: The fitted forecaster model. + config: Model configuration used for training. + metadata: Additional metadata (e.g., store_id, product_id, dates). + created_at: Timestamp when bundle was created. + python_version: Python version used when saving. + sklearn_version: Scikit-learn version used when saving. + bundle_hash: Deterministic hash of bundle contents. + """ + + model: BaseForecaster + config: ModelConfig + metadata: dict[str, object] = field(default_factory=lambda: {}) + + # Auto-populated on save + created_at: datetime | None = None + python_version: str | None = None + sklearn_version: str | None = None + bundle_hash: str | None = None + + def compute_hash(self) -> str: + """Compute deterministic hash of bundle contents. + + Returns: + 16-character hex string hash. + """ + content = { + "config_hash": self.config.config_hash(), + "model_params": self.model.get_params(), + "metadata": self.metadata, + } + return hashlib.sha256( + json.dumps(content, sort_keys=True, default=str).encode() + ).hexdigest()[:16] + + +def save_model_bundle(bundle: ModelBundle, path: str | Path) -> Path: + """Save model bundle to disk using joblib. + + CRITICAL: Records Python and sklearn versions for compatibility warnings. + + Args: + bundle: ModelBundle to save. 
+ path: File path (will add .joblib extension if missing). + + Returns: + Path to saved file. + + Raises: + OSError: If unable to create directory or write file. + """ + path = Path(path) + if not path.suffix: + path = path.with_suffix(".joblib") + + # Ensure directory exists + path.parent.mkdir(parents=True, exist_ok=True) + + # Populate metadata + bundle.created_at = datetime.now(UTC) + bundle.python_version = sys.version + bundle.sklearn_version = sklearn.__version__ + bundle.bundle_hash = bundle.compute_hash() + + # Save with compression + joblib.dump(bundle, path, compress=3) # pyright: ignore[reportUnknownMemberType] + + logger.info( + "forecasting.model_bundle_saved", + path=str(path), + bundle_hash=bundle.bundle_hash, + python_version=bundle.python_version, + sklearn_version=bundle.sklearn_version, + ) + + return path + + +def load_model_bundle(path: str | Path) -> ModelBundle: + """Load model bundle from disk. + + CRITICAL: Logs warning if versions don't match. + + Args: + path: Path to saved bundle. + + Returns: + Loaded ModelBundle. + + Raises: + FileNotFoundError: If path doesn't exist. 
+ """ + path = Path(path) + + if not path.exists(): + raise FileNotFoundError(f"Model bundle not found: {path}") + + bundle: ModelBundle = joblib.load(path) # pyright: ignore[reportUnknownMemberType] + + # Version compatibility warnings + current_python_major_minor = f"{sys.version_info.major}.{sys.version_info.minor}" + if bundle.python_version: + saved_python_major_minor = bundle.python_version.split()[0].rsplit(".", 1)[0] + if saved_python_major_minor != current_python_major_minor: + logger.warning( + "forecasting.python_version_mismatch", + saved_python=bundle.python_version, + current_python=sys.version, + ) + + if bundle.sklearn_version and bundle.sklearn_version != sklearn.__version__: + logger.warning( + "forecasting.sklearn_version_mismatch", + saved_sklearn=bundle.sklearn_version, + current_sklearn=sklearn.__version__, + ) + + logger.info( + "forecasting.model_bundle_loaded", + path=str(path), + bundle_hash=bundle.bundle_hash, + model_type=bundle.config.model_type, + ) + + return bundle diff --git a/app/features/forecasting/routes.py b/app/features/forecasting/routes.py new file mode 100644 index 00000000..f9fbf007 --- /dev/null +++ b/app/features/forecasting/routes.py @@ -0,0 +1,224 @@ +"""Forecasting API routes for model training and prediction.""" + +from fastapi import APIRouter, Depends, HTTPException, status +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.config import get_settings +from app.core.database import get_db +from app.core.exceptions import DatabaseError +from app.core.logging import get_logger +from app.features.forecasting.schemas import ( + PredictRequest, + PredictResponse, + TrainRequest, + TrainResponse, +) +from app.features.forecasting.service import ForecastingService + +logger = get_logger(__name__) + +router = APIRouter(prefix="/forecasting", tags=["forecasting"]) + + +@router.post( + "/train", + response_model=TrainResponse, + status_code=status.HTTP_200_OK, + 
summary="Train a forecasting model", + description=""" +Train a forecasting model for a single store/product series. + +**Model Types:** +- `naive`: Predicts last observed value for all horizons +- `seasonal_naive`: Predicts value from same season in previous cycle +- `moving_average`: Predicts mean of last N observations +- `lightgbm`: LightGBM regressor (feature-flagged, disabled by default) + +**Persistence:** Trained models are saved as ModelBundle files containing: +- The fitted model +- Configuration used for training +- Metadata (store_id, product_id, dates, n_observations) +- Version information for compatibility checking + +**Response:** Returns the path to the saved model bundle for use in prediction. +""", +) +async def train_model( + request: TrainRequest, + db: AsyncSession = Depends(get_db), +) -> TrainResponse: + """Train a forecasting model for a single series. + + Args: + request: Training request with config. + db: Async database session from dependency. + + Returns: + Response with training results and model path. + + Raises: + HTTPException: If model type is disabled or training fails. + NotFoundError: If no training data found. + DatabaseError: If database operation fails. + """ + settings = get_settings() + + # Check if LightGBM is enabled + if request.config.model_type == "lightgbm" and not settings.forecast_enable_lightgbm: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="LightGBM is disabled. 
Set forecast_enable_lightgbm=True in settings.", + ) + + logger.info( + "forecasting.train_request_received", + store_id=request.store_id, + product_id=request.product_id, + train_start_date=str(request.train_start_date), + train_end_date=str(request.train_end_date), + model_type=request.config.model_type, + ) + + service = ForecastingService() + + try: + response = await service.train_model( + db=db, + store_id=request.store_id, + product_id=request.product_id, + train_start_date=request.train_start_date, + train_end_date=request.train_end_date, + config=request.config, + ) + + logger.info( + "forecasting.train_request_completed", + store_id=request.store_id, + product_id=request.product_id, + model_type=request.config.model_type, + model_path=response.model_path, + n_observations=response.n_observations, + duration_ms=response.duration_ms, + ) + + return response + + except ValueError as e: + logger.warning( + "forecasting.train_request_failed", + store_id=request.store_id, + product_id=request.product_id, + error=str(e), + error_type=type(e).__name__, + ) + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e), + ) from e + except SQLAlchemyError as e: + logger.error( + "forecasting.train_request_failed", + store_id=request.store_id, + product_id=request.product_id, + error=str(e), + error_type=type(e).__name__, + exc_info=True, + ) + raise DatabaseError( + message="Failed to train model", + details={"error": str(e)}, + ) from e + + +@router.post( + "/predict", + response_model=PredictResponse, + status_code=status.HTTP_200_OK, + summary="Generate forecasts using a trained model", + description=""" +Generate forecasts using a previously trained model. 
+ +**Inputs:** +- `store_id`, `product_id`: Must match the model's training data +- `horizon`: Number of days to forecast (1-90) +- `model_path`: Path to saved model bundle + +**Forecast Points:** Each point includes: +- `date`: Forecast date +- `forecast`: Point forecast value +- `lower_bound`, `upper_bound`: Prediction intervals (optional, model-dependent) + +**Validation:** The service validates that the model was trained for the +requested store/product combination. +""", +) +async def predict( + request: PredictRequest, + db: AsyncSession = Depends(get_db), # noqa: ARG001 +) -> PredictResponse: + """Generate forecasts using a saved model. + + Args: + request: Prediction request with model path and horizon. + db: Async database session from dependency (unused but kept for consistency). + + Returns: + Response with forecast points. + + Raises: + HTTPException: If model not found or validation fails. + """ + logger.info( + "forecasting.predict_request_received", + store_id=request.store_id, + product_id=request.product_id, + horizon=request.horizon, + model_path=request.model_path, + ) + + service = ForecastingService() + + try: + response = await service.predict( + store_id=request.store_id, + product_id=request.product_id, + horizon=request.horizon, + model_path=request.model_path, + ) + + logger.info( + "forecasting.predict_request_completed", + store_id=request.store_id, + product_id=request.product_id, + horizon=request.horizon, + model_type=response.model_type, + duration_ms=response.duration_ms, + ) + + return response + + except FileNotFoundError as e: + logger.warning( + "forecasting.predict_request_failed", + store_id=request.store_id, + product_id=request.product_id, + error=str(e), + error_type=type(e).__name__, + ) + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=str(e), + ) from e + except ValueError as e: + logger.warning( + "forecasting.predict_request_failed", + store_id=request.store_id, + product_id=request.product_id, + 
error=str(e), + error_type=type(e).__name__, + ) + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e), + ) from e diff --git a/app/features/forecasting/schemas.py b/app/features/forecasting/schemas.py new file mode 100644 index 00000000..1a6808fc --- /dev/null +++ b/app/features/forecasting/schemas.py @@ -0,0 +1,281 @@ +"""Pydantic schemas for forecasting configuration and API contracts. + +Model configs are designed to be: +- Immutable (frozen=True) for reproducibility +- Versioned (schema_version) for registry storage +- Hashable (config_hash) for deduplication +""" + +from __future__ import annotations + +import hashlib +from datetime import date as date_type +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +# ============================================================================= +# Model Configuration Schemas +# ============================================================================= + + +class ModelConfigBase(BaseModel): + """Base configuration for all forecasting models. + + All model configs inherit from this base to ensure: + - Immutability after creation (frozen=True) + - No extra fields allowed (extra="forbid") + - Schema versioning for reproducibility + """ + + model_config = ConfigDict( + frozen=True, + extra="forbid", + ) + + schema_version: str = Field( + default="1.0", + description="Semantic version of this config schema", + pattern=r"^\d+\.\d+(\.\d+)?$", + ) + + def config_hash(self) -> str: + """Generate deterministic hash of configuration. + + Returns: + 16-character hex string hash of config JSON. + """ + config_json = self.model_dump_json() + return hashlib.sha256(config_json.encode()).hexdigest()[:16] + + +class NaiveModelConfig(ModelConfigBase): + """Configuration for naive forecaster (last value). + + The naive forecaster predicts the last observed value for all horizons. 
+ Formula: y_hat[t+h] = y[t] for all h + + This is the simplest baseline and often performs surprisingly well for + stable time series. + """ + + model_type: Literal["naive"] = "naive" + + +class SeasonalNaiveModelConfig(ModelConfigBase): + """Configuration for seasonal naive forecaster. + + Predicts value from same season in previous cycle. + Formula: y_hat[t+h] = y[t+h-m] where m is season_length + + For weekly seasonality (m=7), Friday's forecast = last Friday's value. + + Attributes: + season_length: Seasonality period in days (default: 7 for weekly). + """ + + model_type: Literal["seasonal_naive"] = "seasonal_naive" + season_length: int = Field( + default=7, + ge=1, + le=365, + description="Seasonality period in days", + ) + + +class MovingAverageModelConfig(ModelConfigBase): + """Configuration for moving average forecaster. + + Predicts the mean of the last N observations for all horizons. + Formula: y_hat[t+h] = mean(y[t-window+1:t+1]) + + CRITICAL: Does NOT update recursively - uses same average for all horizons. + + Attributes: + window_size: Window size for averaging (default: 7). + """ + + model_type: Literal["moving_average"] = "moving_average" + window_size: int = Field( + default=7, + ge=1, + le=90, + description="Window size for averaging", + ) + + +class LightGBMModelConfig(ModelConfigBase): + """Configuration for LightGBM regressor (feature-flagged). + + LightGBM is an advanced ML model that uses gradient boosting on + decision trees. Requires feature engineering integration. + + CRITICAL: Only available when forecast_enable_lightgbm=True in settings. + + Attributes: + n_estimators: Number of boosting rounds. + max_depth: Maximum depth of trees. + learning_rate: Learning rate for gradient boosting. + feature_config_hash: Hash of FeatureSetConfig used for training. 
+ """ + + model_type: Literal["lightgbm"] = "lightgbm" + n_estimators: int = Field( + default=100, + ge=10, + le=1000, + description="Number of boosting rounds", + ) + max_depth: int = Field( + default=6, + ge=1, + le=20, + description="Maximum depth of trees", + ) + learning_rate: float = Field( + default=0.1, + ge=0.001, + le=1.0, + description="Learning rate for gradient boosting", + ) + feature_config_hash: str | None = Field( + default=None, + description="Hash of FeatureSetConfig used for training", + ) + + +# Union type for all model configs +ModelConfig = ( + NaiveModelConfig | SeasonalNaiveModelConfig | MovingAverageModelConfig | LightGBMModelConfig +) + + +# ============================================================================= +# API Request/Response Schemas +# ============================================================================= + + +class TrainRequest(BaseModel): + """Request body for POST /forecasting/train. + + Attributes: + store_id: Store ID to train model for. + product_id: Product ID to train model for. + train_start_date: Start date of training period. + train_end_date: End date of training period (inclusive). + config: Model configuration. 
+ """ + + model_config = ConfigDict(strict=True) + + store_id: int = Field(..., ge=1, description="Store ID") + product_id: int = Field(..., ge=1, description="Product ID") + train_start_date: date_type = Field( + ..., + description="Start date of training period", + ) + train_end_date: date_type = Field( + ..., + description="End date of training period (inclusive)", + ) + config: ModelConfig + + @field_validator("train_end_date") + @classmethod + def validate_date_range(cls, v: date_type, info: object) -> date_type: + """Ensure train_end_date is after train_start_date.""" + # Type narrow info to ValidationInfo-like object + data = getattr(info, "data", {}) + if "train_start_date" in data and v <= data["train_start_date"]: + raise ValueError("train_end_date must be after train_start_date") + return v + + +class TrainResponse(BaseModel): + """Response body for POST /forecasting/train. + + Attributes: + store_id: Store ID model was trained for. + product_id: Product ID model was trained for. + model_type: Type of model trained. + model_path: Path to saved model bundle. + config_hash: Hash of the configuration used. + n_observations: Number of observations used for training. + train_start_date: Start date of training period. + train_end_date: End date of training period. + duration_ms: Training duration in milliseconds. + """ + + store_id: int + product_id: int + model_type: str + model_path: str + config_hash: str + n_observations: int + train_start_date: date_type + train_end_date: date_type + duration_ms: float + + +class PredictRequest(BaseModel): + """Request body for POST /forecasting/predict. + + Attributes: + store_id: Store ID to predict for. + product_id: Product ID to predict for. + horizon: Number of days to forecast. + model_path: Path to saved model bundle. 
+ """ + + model_config = ConfigDict(strict=True) + + store_id: int = Field(..., ge=1, description="Store ID") + product_id: int = Field(..., ge=1, description="Product ID") + horizon: int = Field( + ..., + ge=1, + le=90, + description="Number of days to forecast", + ) + model_path: str = Field( + ..., + description="Path to saved model bundle", + ) + + +class ForecastPoint(BaseModel): + """Single forecast point. + + Attributes: + date: Date of the forecast. + forecast: Point forecast value. + lower_bound: Lower bound of prediction interval (optional). + upper_bound: Upper bound of prediction interval (optional). + """ + + date: date_type + forecast: float + lower_bound: float | None = None + upper_bound: float | None = None + + +class PredictResponse(BaseModel): + """Response body for POST /forecasting/predict. + + Attributes: + store_id: Store ID predictions are for. + product_id: Product ID predictions are for. + forecasts: List of forecast points. + model_type: Type of model used. + config_hash: Hash of the configuration used. + horizon: Number of days forecasted. + duration_ms: Prediction duration in milliseconds. + """ + + store_id: int + product_id: int + forecasts: list[ForecastPoint] + model_type: str + config_hash: str + horizon: int + duration_ms: float diff --git a/app/features/forecasting/service.py b/app/features/forecasting/service.py new file mode 100644 index 00000000..9e4a6e18 --- /dev/null +++ b/app/features/forecasting/service.py @@ -0,0 +1,334 @@ +"""Forecasting service for model training and prediction. + +Orchestrates: +- Loading training data from database +- Model instantiation via factory +- Training and prediction +- Model persistence via ModelBundle + +CRITICAL: All operations respect time-safety constraints. 
+""" + +from __future__ import annotations + +import time +import uuid +from dataclasses import dataclass, field +from datetime import UTC, datetime, timedelta +from datetime import date as date_type +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import numpy as np +import structlog +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.config import get_settings +from app.features.data_platform.models import SalesDaily +from app.features.forecasting.models import model_factory +from app.features.forecasting.persistence import ( + ModelBundle, + load_model_bundle, + save_model_bundle, +) +from app.features.forecasting.schemas import ( + ForecastPoint, + ModelConfig, + PredictResponse, + TrainResponse, +) + +if TYPE_CHECKING: + pass + +logger = structlog.get_logger() + + +@dataclass +class TrainingData: + """Container for loaded training data. + + Attributes: + y: Target values as numpy array. + dates: Corresponding dates. + store_id: Store ID. + product_id: Product ID. + n_observations: Number of observations. + """ + + y: np.ndarray[Any, np.dtype[np.floating[Any]]] + dates: list[date_type] + store_id: int + product_id: int + n_observations: int = field(init=False) + + def __post_init__(self) -> None: + """Compute derived fields.""" + self.n_observations = len(self.y) + + +class ForecastingService: + """Service for training and predicting with forecasting models. + + Provides orchestration layer for: + - Loading training data from database + - Training models with configured parameters + - Saving trained models as bundles + - Loading models and generating predictions + + CRITICAL: All operations use Settings for reproducibility. 
class ForecastingService:
    """Orchestrate training of and prediction with forecasting models.

    Responsibilities:
    - Load training data from the database.
    - Fit a model built by ``model_factory`` from a ``ModelConfig``.
    - Persist the fitted model plus metadata as a ``ModelBundle``.
    - Load a saved bundle and turn its predictions into dated forecasts.

    The random seed and the artifact directory are read from the application
    settings so that runs are reproducible.

    NOTE(review): model fitting and bundle save/load are plain synchronous
    calls inside ``async`` methods; if they are slow they hold up the event
    loop. Consider offloading them (e.g. ``asyncio.to_thread``) once expected
    model sizes are known.
    """

    def __init__(self) -> None:
        """Initialize the service with the current application settings."""
        self.settings = get_settings()

    async def train_model(
        self,
        db: AsyncSession,
        store_id: int,
        product_id: int,
        train_start_date: date_type,
        train_end_date: date_type,
        config: ModelConfig,
    ) -> TrainResponse:
        """Train a forecasting model and save it to disk.

        Args:
            db: Database session.
            store_id: Store ID to train for.
            product_id: Product ID to train for.
            train_start_date: Start date of the training period (inclusive).
            train_end_date: End date of the training period (inclusive).
            config: Model configuration.

        Returns:
            TrainResponse describing the saved model and the training run.

        Raises:
            ValueError: If no rows exist for the series in the date range.
                The model's own ``fit`` may also reject a too-short series.
        """
        started = time.perf_counter()

        # Hash the configuration once; it is reported in both log events and
        # in the response.
        config_hash = config.config_hash()

        logger.info(
            "forecasting.train_started",
            store_id=store_id,
            product_id=product_id,
            train_start_date=str(train_start_date),
            train_end_date=str(train_end_date),
            model_type=config.model_type,
            config_hash=config_hash,
        )

        training_data = await self._load_training_data(
            db=db,
            store_id=store_id,
            product_id=product_id,
            start_date=train_start_date,
            end_date=train_end_date,
        )
        n_observations = training_data.n_observations

        if n_observations == 0:
            raise ValueError(
                f"No training data found for store={store_id}, product={product_id} "
                f"between {train_start_date} and {train_end_date}"
            )

        # The seed comes from settings so repeated runs build the same model.
        model = model_factory(config, random_state=self.settings.forecast_random_seed)
        model.fit(training_data.y)

        # The metadata is what predict() later uses to validate the series
        # and to date the forecasts.
        bundle = ModelBundle(
            model=model,
            config=config,
            metadata={
                "store_id": store_id,
                "product_id": product_id,
                "train_start_date": str(train_start_date),
                "train_end_date": str(train_end_date),
                "n_observations": n_observations,
            },
        )

        model_id = uuid.uuid4().hex[:12]
        artifacts_dir = Path(self.settings.forecast_model_artifacts_dir)
        saved_path = save_model_bundle(bundle, artifacts_dir / f"model_{model_id}")

        duration_ms = (time.perf_counter() - started) * 1000

        logger.info(
            "forecasting.train_completed",
            store_id=store_id,
            product_id=product_id,
            model_type=config.model_type,
            config_hash=config_hash,
            n_observations=n_observations,
            model_path=str(saved_path),
            duration_ms=duration_ms,
        )

        return TrainResponse(
            store_id=store_id,
            product_id=product_id,
            model_type=config.model_type,
            model_path=str(saved_path),
            config_hash=config_hash,
            n_observations=n_observations,
            train_start_date=train_start_date,
            train_end_date=train_end_date,
            duration_ms=duration_ms,
        )

    async def predict(
        self,
        store_id: int,
        product_id: int,
        horizon: int,
        model_path: str,
    ) -> PredictResponse:
        """Generate forecasts using a saved model.

        Forecast dates start the day after the ``train_end_date`` stored in
        the bundle metadata. If the bundle carries no such date, today's UTC
        date is used instead and a warning is logged, because the resulting
        forecast dates may not line up with the data the model was fitted on.

        Args:
            store_id: Store ID to predict for.
            product_id: Product ID to predict for.
            horizon: Number of days to forecast.
            model_path: Path to the saved model bundle.

        Returns:
            PredictResponse with one ForecastPoint per forecast day.

        Raises:
            FileNotFoundError: If the model bundle is not found.
            ValueError: If the model was trained for a different store/product.
        """
        started = time.perf_counter()

        logger.info(
            "forecasting.predict_started",
            store_id=store_id,
            product_id=product_id,
            horizon=horizon,
            model_path=model_path,
        )

        # NOTE(review): bundles appear to be joblib files (the tests expect a
        # ``.joblib`` suffix). Deserialising such a file can execute arbitrary
        # code, so model_path must not be taken from untrusted input without
        # restricting it to the artifacts directory. Confirm what callers pass.
        bundle = load_model_bundle(model_path)

        # A bundle is only valid for the series it was trained on; the store
        # is checked before the product.
        for label, requested in (("store", store_id), ("product", product_id)):
            trained_for = bundle.metadata.get(f"{label}_id")
            if trained_for != requested:
                raise ValueError(
                    f"Model was trained for {label}={trained_for}, "
                    f"but prediction requested for {label}={requested}"
                )

        forecasts_array = bundle.model.predict(horizon)

        stored_end_date = bundle.metadata.get("train_end_date")
        if isinstance(stored_end_date, str):
            train_end_date = date_type.fromisoformat(stored_end_date)
        else:
            # Best-effort fallback kept for bundles without the date, but made
            # visible: the forecast dates are then only a guess.
            train_end_date = datetime.now(UTC).date()
            logger.warning(
                "forecasting.predict_missing_train_end_date",
                store_id=store_id,
                product_id=product_id,
                model_path=model_path,
                fallback_date=str(train_end_date),
            )

        # Step k (1-based) is dated k days after the end of training.
        forecasts: list[ForecastPoint] = [
            ForecastPoint(
                date=train_end_date + timedelta(days=step),
                forecast=float(forecasts_array[step - 1]),
            )
            for step in range(1, horizon + 1)
        ]

        model_type = bundle.config.model_type
        config_hash = bundle.config.config_hash()
        duration_ms = (time.perf_counter() - started) * 1000

        logger.info(
            "forecasting.predict_completed",
            store_id=store_id,
            product_id=product_id,
            horizon=horizon,
            model_type=model_type,
            config_hash=config_hash,
            duration_ms=duration_ms,
        )

        return PredictResponse(
            store_id=store_id,
            product_id=product_id,
            forecasts=forecasts,
            model_type=model_type,
            config_hash=config_hash,
            horizon=horizon,
            duration_ms=duration_ms,
        )

    async def _load_training_data(
        self,
        db: AsyncSession,
        store_id: int,
        product_id: int,
        start_date: date_type,
        end_date: date_type,
    ) -> TrainingData:
        """Load the daily quantities of one series from the database.

        Args:
            db: Database session.
            store_id: Store ID.
            product_id: Product ID.
            start_date: Start date (inclusive).
            end_date: End date (inclusive).

        Returns:
            TrainingData ordered by date. When no rows match, ``y`` is an
            empty float64 array and ``dates`` is an empty list.
        """
        stmt = (
            select(
                SalesDaily.date,
                SalesDaily.quantity,
            )
            .where(
                (SalesDaily.store_id == store_id)
                & (SalesDaily.product_id == product_id)
                & (SalesDaily.date >= start_date)
                & (SalesDaily.date <= end_date)
            )
            .order_by(SalesDaily.date)
        )

        rows = (await db.execute(stmt)).all()

        # An empty result needs no special case: both comprehensions yield
        # empty containers and the explicit dtype keeps ``y`` float64.
        return TrainingData(
            y=np.array([float(row.quantity) for row in rows], dtype=np.float64),
            dates=[row.date for row in rows],
            store_id=store_id,
            product_id=product_id,
        )
@pytest.fixture
def sample_seasonal_series() -> np.ndarray:
    """Provide four identical weeks of daily values.

    One week is [10, 20, 30, 40, 50, 60, 70]; the returned array repeats it
    four times, giving 28 values.
    """
    n_weeks = 4
    one_week = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0]
    return np.array(one_week * n_weeks)
class TestNaiveForecaster:
    """Unit tests covering NaiveForecaster fitting, prediction and parameters."""

    def test_fit_stores_last_value(self, sample_time_series):
        """Fitting records the final observation of the series."""
        forecaster = NaiveForecaster()
        forecaster.fit(sample_time_series)

        # The fixture is the sequence 1..60, so the last value is 60.
        assert forecaster.is_fitted
        assert forecaster._last_value == 60.0

    def test_predict_repeats_last_value(self, sample_time_series):
        """Every forecast step equals the last observed value."""
        horizon = 7
        forecaster = NaiveForecaster()
        forecaster.fit(sample_time_series)

        predicted = forecaster.predict(horizon=horizon)

        assert len(predicted) == horizon
        assert not any(value != 60.0 for value in predicted)

    def test_predict_before_fit_raises(self):
        """Predicting with an unfitted model raises RuntimeError."""
        unfitted = NaiveForecaster()

        with pytest.raises(RuntimeError, match="must be fitted"):
            unfitted.predict(horizon=5)

    def test_fit_empty_array_raises(self):
        """Fitting on a series without observations raises ValueError."""
        no_data = np.empty(0, dtype=np.float64)
        forecaster = NaiveForecaster()

        with pytest.raises(ValueError, match="empty"):
            forecaster.fit(no_data)

    def test_determinism(self, sample_time_series):
        """Two models with the same seed and data give identical forecasts."""
        first = NaiveForecaster(random_state=42)
        second = NaiveForecaster(random_state=42)

        for forecaster in (first, second):
            forecaster.fit(sample_time_series)

        first_run, second_run = (m.predict(horizon=10) for m in (first, second))

        np.testing.assert_array_equal(first_run, second_run)

    def test_get_params(self):
        """get_params reports the constructor arguments."""
        forecaster = NaiveForecaster(random_state=123)

        assert forecaster.get_params() == {"random_state": 123}

    def test_set_params(self):
        """set_params overwrites an existing parameter."""
        forecaster = NaiveForecaster(random_state=42)

        forecaster.set_params(random_state=99)

        assert forecaster.random_state == 99
test_get_params(self): + """Test get_params returns expected values.""" + model = SeasonalNaiveForecaster(season_length=14, random_state=42) + params = model.get_params() + + assert params == {"season_length": 14, "random_state": 42} + + def test_set_params(self): + """Test set_params modifies model.""" + model = SeasonalNaiveForecaster(season_length=7) + model.set_params(season_length=30) + + assert model.season_length == 30 + + +class TestMovingAverageForecaster: + """Tests for MovingAverageForecaster.""" + + def test_fit_computes_window_mean(self, sample_constant_series): + """Test that fit computes mean of last window_size values.""" + model = MovingAverageForecaster(window_size=7) + model.fit(sample_constant_series) + + assert model.is_fitted + assert model._forecast_value == 100.0 # Mean of constant series + + def test_predict_returns_constant(self, sample_constant_series): + """Test that predict returns same value for all horizons.""" + model = MovingAverageForecaster(window_size=7) + model.fit(sample_constant_series) + + forecasts = model.predict(horizon=14) + + assert len(forecasts) == 14 + assert all(f == 100.0 for f in forecasts) + + def test_moving_average_calculation(self, sample_time_series): + """Test moving average is calculated correctly.""" + model = MovingAverageForecaster(window_size=7) + model.fit(sample_time_series) + + # Last 7 values: 54, 55, 56, 57, 58, 59, 60 + # Mean: (54 + 55 + 56 + 57 + 58 + 59 + 60) / 7 = 57.0 + expected_mean = 57.0 + + forecasts = model.predict(horizon=3) + assert all(f == expected_mean for f in forecasts) + + def test_insufficient_data_raises(self): + """Test that insufficient data raises ValueError.""" + model = MovingAverageForecaster(window_size=7) + short_data = np.array([1.0, 2.0, 3.0]) + + with pytest.raises(ValueError, match="at least 7"): + model.fit(short_data) + + def test_get_params(self): + """Test get_params returns expected values.""" + model = MovingAverageForecaster(window_size=14, random_state=42) + 
@pytest.mark.parametrize(
    "model_class",
    [NaiveForecaster, SeasonalNaiveForecaster, MovingAverageForecaster],
)
class TestBaseForecasterInterface:
    """Tests for BaseForecaster interface compliance.

    Every test runs once per forecaster class. The parametrization is applied
    at class level, so the individual test IDs stay
    ``test_name[ForecasterClass]``.
    """

    # Constructor arguments each forecaster needs; classes that are not
    # listed are built without arguments.
    _CONSTRUCTOR_KWARGS = {
        SeasonalNaiveForecaster: {"season_length": 7},
        MovingAverageForecaster: {"window_size": 7},
    }

    @classmethod
    def _build(cls, model_class):
        """Return an unfitted instance of ``model_class``."""
        return model_class(**cls._CONSTRUCTOR_KWARGS.get(model_class, {}))

    @staticmethod
    def _series():
        """Return 30 observations, enough for a window or season of 7."""
        return np.arange(1, 31, dtype=np.float64)

    def test_is_fitted_property(self, model_class):
        """Test is_fitted property for all models."""
        model = self._build(model_class)

        assert not model.is_fitted

        model.fit(self._series())

        assert model.is_fitted

    def test_fit_returns_self(self, model_class):
        """Test that fit returns self for method chaining."""
        model = self._build(model_class)

        result = model.fit(self._series())

        assert result is model

    def test_predict_returns_correct_shape(self, model_class):
        """Test that predict returns array of correct shape."""
        model = self._build(model_class)
        model.fit(self._series())

        for horizon in [1, 7, 14, 30]:
            forecasts = model.predict(horizon)
            assert forecasts.shape == (horizon,)
sample_naive_config, sample_time_series): + """Test that compute_hash is deterministic.""" + model = NaiveForecaster() + model.fit(sample_time_series) + + bundle1 = ModelBundle( + model=model, + config=sample_naive_config, + metadata={"store_id": 1}, + ) + + bundle2 = ModelBundle( + model=model, + config=sample_naive_config, + metadata={"store_id": 1}, + ) + + assert bundle1.compute_hash() == bundle2.compute_hash() + + def test_compute_hash_changes_with_metadata(self, sample_naive_config, sample_time_series): + """Test that compute_hash changes when metadata differs.""" + model = NaiveForecaster() + model.fit(sample_time_series) + + bundle1 = ModelBundle( + model=model, + config=sample_naive_config, + metadata={"store_id": 1}, + ) + + bundle2 = ModelBundle( + model=model, + config=sample_naive_config, + metadata={"store_id": 2}, + ) + + assert bundle1.compute_hash() != bundle2.compute_hash() + + +class TestSaveLoadBundle: + """Tests for save_model_bundle and load_model_bundle functions.""" + + def test_save_load_roundtrip(self, sample_naive_config, sample_time_series, tmp_model_path): + """Test saving and loading preserves model predictions.""" + # Create and fit model + model = NaiveForecaster() + model.fit(sample_time_series) + original_predictions = model.predict(horizon=7) + + # Create bundle and save + bundle = ModelBundle( + model=model, + config=sample_naive_config, + metadata={"store_id": 1, "product_id": 2}, + ) + + saved_path = save_model_bundle(bundle, tmp_model_path) + + assert saved_path.exists() + assert saved_path.suffix == ".joblib" + + # Load and verify + loaded_bundle = load_model_bundle(saved_path) + + loaded_predictions = loaded_bundle.model.predict(horizon=7) + np.testing.assert_array_equal(original_predictions, loaded_predictions) + + def test_save_adds_metadata(self, sample_naive_config, sample_time_series, tmp_model_path): + """Test that save adds version metadata.""" + model = NaiveForecaster() + model.fit(sample_time_series) + + bundle = 
ModelBundle( + model=model, + config=sample_naive_config, + metadata={}, + ) + + save_model_bundle(bundle, tmp_model_path) + + assert bundle.created_at is not None + assert bundle.python_version is not None + assert bundle.sklearn_version is not None + assert bundle.bundle_hash is not None + + def test_save_creates_directory(self, sample_naive_config, sample_time_series): + """Test that save creates parent directories if needed.""" + with TemporaryDirectory() as tmpdir: + deep_path = Path(tmpdir) / "a" / "b" / "c" / "model" + + model = NaiveForecaster() + model.fit(sample_time_series) + + bundle = ModelBundle( + model=model, + config=sample_naive_config, + ) + + saved_path = save_model_bundle(bundle, deep_path) + + assert saved_path.exists() + + def test_save_adds_joblib_extension( + self, sample_naive_config, sample_time_series, tmp_model_path + ): + """Test that save adds .joblib extension if missing.""" + model = NaiveForecaster() + model.fit(sample_time_series) + + bundle = ModelBundle( + model=model, + config=sample_naive_config, + ) + + saved_path = save_model_bundle(bundle, tmp_model_path) + + assert saved_path.suffix == ".joblib" + + def test_load_nonexistent_raises(self): + """Test that loading nonexistent file raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError, match="not found"): + load_model_bundle("/nonexistent/path/model.joblib") + + def test_config_preserved(self, sample_seasonal_config, sample_seasonal_series, tmp_model_path): + """Test that config is preserved after save/load.""" + model = SeasonalNaiveForecaster(season_length=7) + model.fit(sample_seasonal_series) + + bundle = ModelBundle( + model=model, + config=sample_seasonal_config, + metadata={"key": "value"}, + ) + + save_model_bundle(bundle, tmp_model_path) + loaded_bundle = load_model_bundle(tmp_model_path + ".joblib") + + assert loaded_bundle.config.model_type == "seasonal_naive" + assert loaded_bundle.config.season_length == 7 + assert loaded_bundle.config.config_hash() 
== sample_seasonal_config.config_hash() + + def test_metadata_preserved(self, sample_naive_config, sample_time_series, tmp_model_path): + """Test that metadata is preserved after save/load.""" + model = NaiveForecaster() + model.fit(sample_time_series) + + metadata = { + "store_id": 42, + "product_id": 99, + "train_start_date": "2024-01-01", + "train_end_date": "2024-01-31", + } + + bundle = ModelBundle( + model=model, + config=sample_naive_config, + metadata=metadata, + ) + + save_model_bundle(bundle, tmp_model_path) + loaded_bundle = load_model_bundle(tmp_model_path + ".joblib") + + assert loaded_bundle.metadata["store_id"] == 42 + assert loaded_bundle.metadata["product_id"] == 99 + assert loaded_bundle.metadata["train_start_date"] == "2024-01-01" + + def test_bundle_hash_preserved(self, sample_naive_config, sample_time_series, tmp_model_path): + """Test that bundle_hash is preserved after save/load.""" + model = NaiveForecaster() + model.fit(sample_time_series) + + bundle = ModelBundle( + model=model, + config=sample_naive_config, + ) + + save_model_bundle(bundle, tmp_model_path) + original_hash = bundle.bundle_hash + + loaded_bundle = load_model_bundle(tmp_model_path + ".joblib") + + assert loaded_bundle.bundle_hash == original_hash diff --git a/app/features/forecasting/tests/test_schemas.py b/app/features/forecasting/tests/test_schemas.py new file mode 100644 index 00000000..cb559e62 --- /dev/null +++ b/app/features/forecasting/tests/test_schemas.py @@ -0,0 +1,278 @@ +"""Tests for forecasting schemas.""" + +from datetime import date + +import pytest +from pydantic import ValidationError + +from app.features.forecasting.schemas import ( + ForecastPoint, + LightGBMModelConfig, + MovingAverageModelConfig, + NaiveModelConfig, + PredictRequest, + PredictResponse, + SeasonalNaiveModelConfig, + TrainRequest, + TrainResponse, +) + + +class TestNaiveModelConfig: + """Tests for NaiveModelConfig schema.""" + + def test_default_values(self): + """Test default 
class TestSeasonalNaiveModelConfig:
    """Schema checks for SeasonalNaiveModelConfig."""

    def test_default_season_length(self):
        """Without arguments the season length is 7 (weekly)."""
        assert SeasonalNaiveModelConfig().season_length == 7

    def test_custom_season_length(self):
        """An explicit season length is stored unchanged."""
        monthly = SeasonalNaiveModelConfig(season_length=30)

        assert monthly.season_length == 30

    def test_season_length_validation_min(self):
        """A season length of 0 is rejected."""
        too_small = {"season_length": 0}

        with pytest.raises(ValidationError):
            SeasonalNaiveModelConfig(**too_small)

    def test_season_length_validation_max(self):
        """A season length of 400 is rejected."""
        too_large = {"season_length": 400}

        with pytest.raises(ValidationError):
            SeasonalNaiveModelConfig(**too_large)

    def test_config_hash_includes_season_length(self):
        """Configs that differ only in season length hash differently."""
        weekly_hash = SeasonalNaiveModelConfig(season_length=7).config_hash()
        biweekly_hash = SeasonalNaiveModelConfig(season_length=14).config_hash()

        assert weekly_hash != biweekly_hash
MovingAverageModelConfig schema.""" + + def test_default_window_size(self): + """Test default window size is 7.""" + config = MovingAverageModelConfig() + assert config.window_size == 7 + + def test_window_size_validation_min(self): + """Test window size minimum validation.""" + with pytest.raises(ValidationError): + MovingAverageModelConfig(window_size=0) + + def test_window_size_validation_max(self): + """Test window size maximum validation.""" + with pytest.raises(ValidationError): + MovingAverageModelConfig(window_size=100) + + def test_frozen_immutability(self): + """Test that config is immutable.""" + config = MovingAverageModelConfig() + with pytest.raises(ValidationError): + config.window_size = 14 + + +class TestLightGBMModelConfig: + """Tests for LightGBMModelConfig schema.""" + + def test_default_values(self): + """Test default configuration values.""" + config = LightGBMModelConfig() + assert config.model_type == "lightgbm" + assert config.n_estimators == 100 + assert config.max_depth == 6 + assert config.learning_rate == 0.1 + assert config.feature_config_hash is None + + def test_parameter_validation(self): + """Test parameter range validation.""" + # n_estimators too low + with pytest.raises(ValidationError): + LightGBMModelConfig(n_estimators=5) + + # max_depth too high + with pytest.raises(ValidationError): + LightGBMModelConfig(max_depth=25) + + # learning_rate too low + with pytest.raises(ValidationError): + LightGBMModelConfig(learning_rate=0.0001) + + +class TestTrainRequest: + """Tests for TrainRequest schema.""" + + def test_valid_request(self): + """Test valid training request.""" + request = TrainRequest( + store_id=1, + product_id=2, + train_start_date=date(2024, 1, 1), + train_end_date=date(2024, 1, 31), + config=NaiveModelConfig(), + ) + assert request.store_id == 1 + assert request.product_id == 2 + + def test_date_range_validation(self): + """Test that train_end_date must be after train_start_date.""" + with 
class TestPredictRequest:
    """Schema checks for PredictRequest."""

    def test_valid_request(self):
        """A well-formed request keeps the requested horizon."""
        payload = {
            "store_id": 1,
            "product_id": 2,
            "horizon": 14,
            "model_path": "/path/to/model.joblib",
        }

        assert PredictRequest(**payload).horizon == 14

    def test_horizon_validation_min(self):
        """A horizon of 0 is rejected."""
        payload = {
            "store_id": 1,
            "product_id": 1,
            "horizon": 0,
            "model_path": "/path/to/model.joblib",
        }

        with pytest.raises(ValidationError):
            PredictRequest(**payload)

    def test_horizon_validation_max(self):
        """A horizon of 100 is rejected."""
        payload = {
            "store_id": 1,
            "product_id": 1,
            "horizon": 100,
            "model_path": "/path/to/model.joblib",
        }

        with pytest.raises(ValidationError):
            PredictRequest(**payload)
+ date=date(2024, 2, 1), + forecast=100.0, + lower_bound=80.0, + upper_bound=120.0, + ) + assert point.lower_bound == 80.0 + assert point.upper_bound == 120.0 + + +class TestPredictResponse: + """Tests for PredictResponse schema.""" + + def test_valid_response(self): + """Test valid prediction response.""" + response = PredictResponse( + store_id=1, + product_id=2, + forecasts=[ + ForecastPoint(date=date(2024, 2, 1), forecast=100.0), + ForecastPoint(date=date(2024, 2, 2), forecast=101.0), + ], + model_type="naive", + config_hash="abc123def456", + horizon=2, + duration_ms=10.5, + ) + assert len(response.forecasts) == 2 + assert response.horizon == 2 + + +class TestTrainResponse: + """Tests for TrainResponse schema.""" + + def test_valid_response(self): + """Test valid training response.""" + response = TrainResponse( + store_id=1, + product_id=2, + model_type="naive", + model_path="/artifacts/models/model_abc123.joblib", + config_hash="abc123def456", + n_observations=31, + train_start_date=date(2024, 1, 1), + train_end_date=date(2024, 1, 31), + duration_ms=150.5, + ) + assert response.n_observations == 31 + assert response.model_path.endswith(".joblib") diff --git a/app/features/forecasting/tests/test_service.py b/app/features/forecasting/tests/test_service.py new file mode 100644 index 00000000..fa93e27e --- /dev/null +++ b/app/features/forecasting/tests/test_service.py @@ -0,0 +1,267 @@ +"""Tests for forecasting service.""" + +from datetime import date +from pathlib import Path +from tempfile import TemporaryDirectory +from unittest.mock import AsyncMock, MagicMock, patch + +import numpy as np +import pytest + +from app.features.forecasting.models import NaiveForecaster, model_factory +from app.features.forecasting.persistence import ModelBundle, save_model_bundle +from app.features.forecasting.schemas import ( + MovingAverageModelConfig, + NaiveModelConfig, + SeasonalNaiveModelConfig, +) +from app.features.forecasting.service import ForecastingService, 
class TestTrainingData:
    """Checks for the TrainingData container."""

    def test_n_observations_computed(self):
        """n_observations equals the number of target values."""
        values = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
        days = [date(2024, 1, day) for day in range(1, 6)]

        loaded = TrainingData(y=values, dates=days, store_id=1, product_id=2)

        assert loaded.n_observations == 5

    def test_empty_data(self):
        """An empty series reports zero observations."""
        no_values = np.array([], dtype=np.float64)

        loaded = TrainingData(y=no_values, dates=[], store_id=1, product_id=1)

        assert loaded.n_observations == 0
NaiveForecaster() + model.fit(sample_time_series) + + bundle = ModelBundle( + model=model, + config=sample_naive_config, + metadata={ + "store_id": 1, + "product_id": 2, + "train_end_date": "2024-01-31", + }, + ) + + path = Path(tmpdir) / "test_model" + saved = save_model_bundle(bundle, path) + yield str(saved) + + @pytest.mark.asyncio + async def test_predict_returns_correct_horizon(self, saved_model_path): + """Test that predict returns correct number of forecast points.""" + service = ForecastingService() + + response = await service.predict( + store_id=1, + product_id=2, + horizon=7, + model_path=saved_model_path, + ) + + assert len(response.forecasts) == 7 + assert response.horizon == 7 + + @pytest.mark.asyncio + async def test_predict_validates_store_id(self, saved_model_path): + """Test that predict validates store_id matches model.""" + service = ForecastingService() + + with pytest.raises(ValueError, match="store=1"): + await service.predict( + store_id=999, # Wrong store + product_id=2, + horizon=7, + model_path=saved_model_path, + ) + + @pytest.mark.asyncio + async def test_predict_validates_product_id(self, saved_model_path): + """Test that predict validates product_id matches model.""" + service = ForecastingService() + + with pytest.raises(ValueError, match="product=2"): + await service.predict( + store_id=1, + product_id=999, # Wrong product + horizon=7, + model_path=saved_model_path, + ) + + @pytest.mark.asyncio + async def test_predict_file_not_found(self): + """Test that predict raises for missing model file.""" + service = ForecastingService() + + with pytest.raises(FileNotFoundError): + await service.predict( + store_id=1, + product_id=1, + horizon=7, + model_path="/nonexistent/model.joblib", + ) + + @pytest.mark.asyncio + async def test_predict_forecast_dates(self, saved_model_path): + """Test that predict generates correct forecast dates.""" + service = ForecastingService() + + response = await service.predict( + store_id=1, + product_id=2, + 
horizon=3, + model_path=saved_model_path, + ) + + # Train end date was 2024-01-31, so forecasts start 2024-02-01 + assert response.forecasts[0].date == date(2024, 2, 1) + assert response.forecasts[1].date == date(2024, 2, 2) + assert response.forecasts[2].date == date(2024, 2, 3) + + @pytest.mark.asyncio + async def test_predict_includes_model_type(self, saved_model_path): + """Test that predict response includes model type.""" + service = ForecastingService() + + response = await service.predict( + store_id=1, + product_id=2, + horizon=7, + model_path=saved_model_path, + ) + + assert response.model_type == "naive" + + @pytest.mark.asyncio + async def test_predict_includes_config_hash(self, saved_model_path): + """Test that predict response includes config hash.""" + service = ForecastingService() + + response = await service.predict( + store_id=1, + product_id=2, + horizon=7, + model_path=saved_model_path, + ) + + assert response.config_hash is not None + assert len(response.config_hash) == 16 + + +class TestForecastingServiceTrain: + """Tests for ForecastingService.train_model method.""" + + @pytest.mark.asyncio + async def test_train_empty_data_raises(self): + """Test that training with no data raises ValueError.""" + service = ForecastingService() + + # Mock database session that returns empty result + mock_db = AsyncMock() + mock_result = MagicMock() + mock_result.all.return_value = [] + mock_db.execute.return_value = mock_result + + with pytest.raises(ValueError, match="No training data"): + await service.train_model( + db=mock_db, + store_id=1, + product_id=1, + train_start_date=date(2024, 1, 1), + train_end_date=date(2024, 1, 31), + config=NaiveModelConfig(), + ) + + @pytest.mark.asyncio + async def test_train_returns_model_path(self): + """Test that training returns a valid model path.""" + service = ForecastingService() + + # Mock database session with sample data + mock_db = AsyncMock() + mock_result = MagicMock() + + # Create mock rows + mock_rows = [] 
+ for i in range(30): + row = MagicMock() + row.date = date(2024, 1, i + 1) + row.quantity = float(i + 1) + mock_rows.append(row) + + mock_result.all.return_value = mock_rows + mock_db.execute.return_value = mock_result + + with TemporaryDirectory() as tmpdir: + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_random_seed = 42 + settings.forecast_model_artifacts_dir = tmpdir + mock_settings.return_value = settings + + response = await service.train_model( + db=mock_db, + store_id=1, + product_id=2, + train_start_date=date(2024, 1, 1), + train_end_date=date(2024, 1, 30), + config=NaiveModelConfig(), + ) + + assert response.model_path.endswith(".joblib") + assert Path(response.model_path).exists() + assert response.n_observations == 30 + assert response.model_type == "naive" diff --git a/app/main.py b/app/main.py index 584608c9..9175219b 100644 --- a/app/main.py +++ b/app/main.py @@ -11,6 +11,7 @@ from app.core.logging import configure_logging, get_logger from app.core.middleware import RequestIdMiddleware from app.features.featuresets.routes import router as featuresets_router +from app.features.forecasting.routes import router as forecasting_router from app.features.ingest.routes import router as ingest_router logger = get_logger(__name__) @@ -70,6 +71,7 @@ def create_app() -> FastAPI: app.include_router(health_router) app.include_router(ingest_router) app.include_router(featuresets_router) + app.include_router(forecasting_router) return app diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 1aa6a623..0e2b5735 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -236,16 +236,51 @@ The `FeatureEngineeringService` provides time-safe feature computation with CRIT ## 7) ForecastOps (Training + Backtesting + Registry) -### 7.1 Model Zoo (Minimum) -- naive -- seasonal naive -- moving average (configurable window) +### 7.1 Model Zoo — ✅ IMPLEMENTED (Baseline Models) -### 7.2 
Backtesting Protocol +**Implemented via PRP-5** - Forecasting module provides: + +| Model | Description | Config Parameters | +|-------|-------------|-------------------| +| `naive` | Last observed value | None | +| `seasonal_naive` | Previous season value | `season_length` (e.g., 7 for weekly) | +| `moving_average` | Mean of last N observations | `window_size` (default: 7) | +| `lightgbm` | LightGBM regressor | Feature-flagged, disabled by default | + +**Model Interface:** All models inherit from `BaseForecaster` with typed `fit()` and `predict()` methods. + +**Persistence:** Models saved as `ModelBundle` (joblib) containing fitted model, config, metadata, and version info. + +### 7.2 API Endpoints + +- `POST /forecasting/train` - Train model for a single series (returns model_path) +- `POST /forecasting/predict` - Generate forecasts using saved model + +### 7.3 Location + +- Models: `app/features/forecasting/models.py` +- Persistence: `app/features/forecasting/persistence.py` +- Schemas: `app/features/forecasting/schemas.py` +- Service: `app/features/forecasting/service.py` +- Routes: `app/features/forecasting/routes.py` +- Tests: `app/features/forecasting/tests/` (comprehensive test coverage) +- Examples: `examples/models/` (baseline_naive.py, baseline_seasonal.py, baseline_mavg.py) + +### 7.4 Configuration (Settings) + +```python +forecast_random_seed: int = 42 +forecast_default_horizon: int = 14 +forecast_max_horizon: int = 90 +forecast_model_artifacts_dir: str = "./artifacts/models" +forecast_enable_lightgbm: bool = False +``` + +### 7.5 Backtesting Protocol (Planned) - Time-based CV only: rolling or expanding splits (no random split). - Metrics: MAE, sMAPE (pinball loss later if needed). 
-### 7.3 Model Registry +### 7.6 Model Registry (Planned) Each run stores: - run_id, timestamps - model_type + model_config (JSON) @@ -259,13 +294,18 @@ Each run stores: ## 8) Typed FastAPI Contracts (Serving Layer) -Minimum endpoint categories (planned): -- `POST /ingest/sales-daily` (optional `/ingest/transactions`) -- `POST /train` (returns `run_id`, optional `job_id`) -- `POST /predict` -- `GET /runs`, `GET /runs/{run_id}` -- `GET /data/kpis`, `GET /data/drilldowns` -- `POST /rag/query` (optional `/rag/index` in dev) +**Implemented Endpoints:** +- `GET /health` - Health check +- `POST /ingest/sales-daily` - Batch upsert daily sales records +- `POST /featuresets/compute` - Compute time-safe features +- `POST /featuresets/preview` - Preview features with sample rows +- `POST /forecasting/train` - Train forecasting model (returns model_path) +- `POST /forecasting/predict` - Generate forecasts using saved model + +**Planned Endpoints:** +- `GET /runs`, `GET /runs/{run_id}` - Model registry and leaderboard +- `GET /data/kpis`, `GET /data/drilldowns` - Data exploration +- `POST /rag/query` - RAG knowledge base queries (optional `/rag/index` in dev) Contracts are Pydantic v2 validated and use `response_model` for explicit output typing. diff --git a/examples/models/baseline_mavg.py b/examples/models/baseline_mavg.py new file mode 100644 index 00000000..7cd7c900 --- /dev/null +++ b/examples/models/baseline_mavg.py @@ -0,0 +1,89 @@ +"""Example: Training and predicting with the Moving Average forecaster. + +The Moving Average forecaster predicts the mean of the last N observations +for all horizons. This is a simple baseline that smooths out short-term +fluctuations. 
+ +Usage: + python examples/models/baseline_mavg.py +""" + +import numpy as np + +from app.features.forecasting.models import MovingAverageForecaster +from app.features.forecasting.persistence import ModelBundle, load_model_bundle, save_model_bundle +from app.features.forecasting.schemas import MovingAverageModelConfig + + +def main(): + # 1. Create sample data with some variation + # Sequential values with noise + np.random.seed(42) + base = np.arange(1, 31, dtype=np.float64) + noise = np.random.normal(0, 2, 30) + y = base + noise + print(f"Training data: {len(y)} observations") + print(f"Last 7 values: {y[-7:].round(2)}") + print(f"Mean of last 7: {np.mean(y[-7:]):.2f}") + + # 2. Create and configure the model + config = MovingAverageModelConfig( + schema_version="1.0", + window_size=7, + ) + model = MovingAverageForecaster(window_size=7, random_state=42) + + # 3. Fit the model + model.fit(y) + print(f"\nModel fitted: {model.is_fitted}") + print(f"Model params: {model.get_params()}") + print(f"Computed forecast value: {model._forecast_value:.2f}") + + # 4. Generate predictions + horizon = 7 + forecasts = model.predict(horizon=horizon) + print(f"\n{horizon}-day forecast:") + for i, f in enumerate(forecasts): + print(f" Day {i + 1}: {f:.2f}") + + # 5. Verify all forecasts are the same (constant prediction) + print("\nVerifying constant forecast:") + assert all(f == forecasts[0] for f in forecasts), "All forecasts should be equal!" + print(f" ✓ All {horizon} forecasts equal: {forecasts[0]:.2f}") + + # 6. Compare different window sizes + print("\nComparing window sizes:") + for window in [3, 7, 14, 21]: + if window <= len(y): + temp_model = MovingAverageForecaster(window_size=window) + temp_model.fit(y) + pred = temp_model.predict(horizon=1)[0] + actual_mean = np.mean(y[-window:]) + print(f" Window {window:2d}: forecast={pred:.2f}, actual_mean={actual_mean:.2f}") + + # 7. 
Save the model bundle + bundle = ModelBundle( + model=model, + config=config, + metadata={ + "store_id": 1, + "product_id": 1, + "train_start_date": "2024-01-01", + "train_end_date": "2024-01-30", + "n_observations": len(y), + "window_size": 7, + }, + ) + + model_path = save_model_bundle(bundle, "./artifacts/models/mavg_example") + print(f"\nModel saved to: {model_path}") + + # 8. Load and verify + loaded_bundle = load_model_bundle(model_path) + loaded_forecasts = loaded_bundle.model.predict(horizon=3) + print(f"\nLoaded model forecast: {loaded_forecasts}") + print(f"Config: window_size={loaded_bundle.config.window_size}") + + +if __name__ == "__main__": + main() diff --git a/examples/models/baseline_naive.py b/examples/models/baseline_naive.py new file mode 100644 index 00000000..997b8ff1 --- /dev/null +++ b/examples/models/baseline_naive.py @@ -0,0 +1,64 @@ +"""Example: Training and predicting with the Naive forecaster. + +The Naive forecaster predicts the last observed value for all horizons. +This is the simplest baseline model and often works well for stable series. + +Usage: + python examples/models/baseline_naive.py +""" + +import numpy as np + +from app.features.forecasting.models import NaiveForecaster +from app.features.forecasting.persistence import ModelBundle, load_model_bundle, save_model_bundle +from app.features.forecasting.schemas import NaiveModelConfig + + +def main(): + # 1. Create sample data (30 days of sequential values) + y = np.array(range(1, 31), dtype=np.float64) + print(f"Training data: {len(y)} observations") + print(f"Last value: {y[-1]}") + + # 2. Create and configure the model + config = NaiveModelConfig(schema_version="1.0") + model = NaiveForecaster(random_state=42) + + # 3. Fit the model + model.fit(y) + print(f"\nModel fitted: {model.is_fitted}") + print(f"Model params: {model.get_params()}") + + # 4. 
Generate predictions + horizon = 7 + forecasts = model.predict(horizon=horizon) + print(f"\n{horizon}-day forecast:") + for i, f in enumerate(forecasts): + print(f" Day {i + 1}: {f:.2f}") + + # 5. Save the model bundle + bundle = ModelBundle( + model=model, + config=config, + metadata={ + "store_id": 1, + "product_id": 1, + "train_start_date": "2024-01-01", + "train_end_date": "2024-01-30", + "n_observations": len(y), + }, + ) + + model_path = save_model_bundle(bundle, "./artifacts/models/naive_example") + print(f"\nModel saved to: {model_path}") + + # 6. Load and verify + loaded_bundle = load_model_bundle(model_path) + loaded_forecasts = loaded_bundle.model.predict(horizon=horizon) + print(f"\nLoaded model forecast: {loaded_forecasts}") + print(f"Config hash: {loaded_bundle.config.config_hash()}") + print(f"Bundle hash: {loaded_bundle.bundle_hash}") + + +if __name__ == "__main__": + main() diff --git a/examples/models/baseline_seasonal.py b/examples/models/baseline_seasonal.py new file mode 100644 index 00000000..65aeffb3 --- /dev/null +++ b/examples/models/baseline_seasonal.py @@ -0,0 +1,81 @@ +"""Example: Training and predicting with the Seasonal Naive forecaster. + +The Seasonal Naive forecaster predicts values from the same season in +the previous cycle. For weekly seasonality (season_length=7), Friday's +forecast equals last Friday's value. + +Usage: + python examples/models/baseline_seasonal.py +""" + +import numpy as np + +from app.features.forecasting.models import SeasonalNaiveForecaster +from app.features.forecasting.persistence import ModelBundle, load_model_bundle, save_model_bundle +from app.features.forecasting.schemas import SeasonalNaiveModelConfig + + +def main(): + # 1. 
Create sample data with weekly pattern + # Pattern: Mon=10, Tue=20, Wed=30, Thu=40, Fri=50, Sat=60, Sun=70 + weekly_pattern = np.array([10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0]) + y = np.tile(weekly_pattern, 4) # 4 weeks = 28 days + print(f"Training data: {len(y)} observations (4 weeks)") + print(f"Weekly pattern: {weekly_pattern}") + print(f"Last week: {y[-7:]}") + + # 2. Create and configure the model + config = SeasonalNaiveModelConfig( + schema_version="1.0", + season_length=7, # Weekly seasonality + ) + model = SeasonalNaiveForecaster(season_length=7, random_state=42) + + # 3. Fit the model + model.fit(y) + print(f"\nModel fitted: {model.is_fitted}") + print(f"Model params: {model.get_params()}") + print(f"Stored seasonal values: {model._last_values}") + + # 4. Generate predictions for 2 weeks + horizon = 14 + forecasts = model.predict(horizon=horizon) + print(f"\n{horizon}-day forecast (2 weeks):") + days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] + for i, f in enumerate(forecasts): + day_name = days[i % 7] + print(f" Day {i + 1} ({day_name}): {f:.2f}") + + # 5. Verify seasonality is preserved + print("\nVerifying seasonal cycling:") + print(f" Week 1: {forecasts[:7]}") + print(f" Week 2: {forecasts[7:]}") + assert np.array_equal(forecasts[:7], forecasts[7:]), "Seasonal pattern should repeat!" + print(" ✓ Pattern repeats correctly") + + # 6. Save the model bundle + bundle = ModelBundle( + model=model, + config=config, + metadata={ + "store_id": 1, + "product_id": 1, + "train_start_date": "2024-01-01", + "train_end_date": "2024-01-28", + "n_observations": len(y), + "seasonality": "weekly", + }, + ) + + model_path = save_model_bundle(bundle, "./artifacts/models/seasonal_example") + print(f"\nModel saved to: {model_path}") + + # 7. 
Load and verify + loaded_bundle = load_model_bundle(model_path) + loaded_forecasts = loaded_bundle.model.predict(horizon=7) + print(f"\nLoaded model forecast (1 week): {loaded_forecasts}") + print(f"Config: season_length={loaded_bundle.config.season_length}") + + +if __name__ == "__main__": + main() diff --git a/examples/models/model_interface.md b/examples/models/model_interface.md new file mode 100644 index 00000000..bf5d391a --- /dev/null +++ b/examples/models/model_interface.md @@ -0,0 +1,215 @@ +# Model Interface Contract + +This document describes the interface contract for ForecastLabAI forecasting models. + +## BaseForecaster Interface + +All forecasting models implement the `BaseForecaster` abstract base class, which follows +scikit-learn conventions for estimators. + +### Required Methods + +#### `fit(y, X=None) -> self` + +Fit the model on historical data. + +**Parameters:** +- `y` (np.ndarray): Target values, 1D array of shape `[n_samples]` +- `X` (np.ndarray | None): Optional exogenous features, 2D array of shape `[n_samples, n_features]` + +**Returns:** +- `self`: For method chaining + +**Raises:** +- `ValueError`: If `y` is empty or has insufficient observations for the model + +**Example:** +```python +model = NaiveForecaster() +model.fit(y=np.array([1, 2, 3, 4, 5])) +``` + +#### `predict(horizon, X=None) -> np.ndarray` + +Generate forecasts for the specified horizon. + +**Parameters:** +- `horizon` (int): Number of steps to forecast +- `X` (np.ndarray | None): Optional exogenous features for forecast period + +**Returns:** +- `np.ndarray`: Array of forecasts with shape `[horizon]` + +**Raises:** +- `RuntimeError`: If model has not been fitted + +**Example:** +```python +forecasts = model.predict(horizon=7) +# Returns: array([5., 5., 5., 5., 5., 5., 5.]) +``` + +#### `get_params() -> dict[str, Any]` + +Get model parameters (scikit-learn convention). 
+ +**Returns:** +- `dict`: Dictionary of parameter names to values + +**Example:** +```python +params = model.get_params() +# Returns: {"random_state": 42} +``` + +#### `set_params(**params) -> self` + +Set model parameters (scikit-learn convention). + +**Parameters:** +- `**params`: Parameter names and values to set + +**Returns:** +- `self`: For method chaining + +**Example:** +```python +model.set_params(random_state=99) +``` + +### Properties + +#### `is_fitted: bool` + +Check if the model has been fitted. + +**Returns:** +- `True` if `fit()` has been called successfully + +--- + +## Model Configurations + +Each model type has a corresponding configuration schema: + +### NaiveModelConfig + +```python +{ + "schema_version": "1.0", + "model_type": "naive" +} +``` + +### SeasonalNaiveModelConfig + +```python +{ + "schema_version": "1.0", + "model_type": "seasonal_naive", + "season_length": 7 # 1-365 +} +``` + +### MovingAverageModelConfig + +```python +{ + "schema_version": "1.0", + "model_type": "moving_average", + "window_size": 7 # 1-90 +} +``` + +--- + +## Model Formulas + +### Naive Forecaster + +``` +ŷ[t+h] = y[t] for all h ∈ [1, horizon] +``` + +Predicts the last observed value for all future horizons. + +### Seasonal Naive Forecaster + +``` +ŷ[t+h] = y[t + h - m·(k + 1)] where m = season_length, k = ⌊(h - 1) / m⌋ +``` + +Predicts the value from the same position in the last observed seasonal cycle; the cycle repeats for horizons longer than one season. + +### Moving Average Forecaster + +``` +ŷ[t+h] = mean(y[t-window_size+1:t+1]) for all h ∈ [1, horizon] +``` + +Predicts the average of the last `window_size` observations. + +--- + +## Persistence (ModelBundle) + +Models are persisted using `ModelBundle` which includes: + +```python +@dataclass +class ModelBundle: + model: BaseForecaster # Fitted model + config: ModelConfig # Configuration used + metadata: dict[str, Any] # Custom metadata (store_id, dates, etc.) 
+ created_at: datetime # Save timestamp + python_version: str # Python version + sklearn_version: str # Scikit-learn version + bundle_hash: str # Deterministic hash +``` + +### Save/Load + +```python +from app.features.forecasting.persistence import save_model_bundle, load_model_bundle + +# Save +path = save_model_bundle(bundle, "./artifacts/models/my_model") + +# Load +bundle = load_model_bundle(path) +forecasts = bundle.model.predict(horizon=7) +``` + +--- + +## Determinism + +All models must be deterministic given the same: +1. Input data (`y`, `X`) +2. Configuration parameters +3. `random_state` + +This ensures reproducibility in experiments and backtesting. + +--- + +## Input/Output Shapes + +| Method | Input Shape | Output Shape | +|--------|-------------|--------------| +| `fit(y)` | `[n_samples]` | `self` | +| `fit(y, X)` | `y: [n_samples]`, `X: [n_samples, n_features]` | `self` | +| `predict(horizon)` | `int` | `[horizon]` | +| `predict(horizon, X)` | `int`, `X: [horizon, n_features]` | `[horizon]` | + +--- + +## Error Handling + +| Scenario | Exception | Message | +|----------|-----------|---------| +| Empty training data | `ValueError` | "Cannot fit on empty array" | +| Insufficient data for seasonal | `ValueError` | "Need at least {season_length} observations" | +| Insufficient data for MA | `ValueError` | "Need at least {window_size} observations" | +| Predict before fit | `RuntimeError` | "Model must be fitted before predict" | +| Unknown model type | `ValueError` | "Unknown model type: {type}" | diff --git a/pyproject.toml b/pyproject.toml index c987ea52..381aeb31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,8 @@ dependencies = [ "python-dotenv>=1.0.1", "pandas>=3.0.0", "numpy>=2.4.1", + "scikit-learn>=1.6.0", + "joblib>=1.4.0", ] [project.optional-dependencies] @@ -84,7 +86,7 @@ ignore = [ "**/tests/**/*.py" = ["S101", "ANN", "ARG001", "ARG002", "ARG005", "D", "E731"] "__init__.py" = ["F401"] "scripts/**/*.py" = ["T201", "ANN", 
"S101"] -"examples/**/*.py" = ["T201", "ANN"] +"examples/**/*.py" = ["T201", "ANN", "S101"] "app/core/health.py" = ["B008"] "app/features/*/routes.py" = ["B008"] "alembic/env.py" = ["ANN"] diff --git a/uv.lock b/uv.lock index a8937ab1..8cf97caa 100644 --- a/uv.lock +++ b/uv.lock @@ -216,17 +216,19 @@ wheels = [ [[package]] name = "forecastlabai" -version = "0.1.5" +version = "0.1.6" source = { editable = "." } dependencies = [ { name = "alembic" }, { name = "asyncpg" }, { name = "fastapi" }, + { name = "joblib" }, { name = "numpy" }, { name = "pandas" }, { name = "pydantic" }, { name = "pydantic-settings" }, { name = "python-dotenv" }, + { name = "scikit-learn" }, { name = "sqlalchemy", extra = ["asyncio"] }, { name = "structlog" }, { name = "uvicorn", extra = ["standard"] }, @@ -254,6 +256,7 @@ requires-dist = [ { name = "asyncpg", specifier = ">=0.30.0" }, { name = "fastapi", specifier = ">=0.115.0" }, { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" }, + { name = "joblib", specifier = ">=1.4.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.13.0" }, { name = "numpy", specifier = ">=2.4.1" }, { name = "pandas", specifier = ">=3.0.0" }, @@ -265,6 +268,7 @@ requires-dist = [ { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.0.0" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.8.0" }, + { name = "scikit-learn", specifier = ">=1.6.0" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.36" }, { name = "structlog", specifier = ">=24.4.0" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.32.0" }, @@ -401,6 +405,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] 
+[[package]] +name = "joblib" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, +] + [[package]] name = "librt" version = "0.7.8" @@ -990,6 +1003,111 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/6a/40fee331a52339926a92e17ae748827270b288a35ef4a15c9c8f2ec54715/ruff-0.14.14-py3-none-win_arm64.whl", hash = "sha256:56e6981a98b13a32236a72a8da421d7839221fa308b223b9283312312e5ac76c", size = 10920448, upload-time = "2026-01-22T22:30:15.417Z" }, ] +[[package]] +name = "scikit-learn" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/74/e6a7cc4b820e95cc38cf36cd74d5aa2b42e8ffc2d21fe5a9a9c45c1c7630/scikit_learn-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5fb63362b5a7ddab88e52b6dbb47dac3fd7dafeee740dc6c8d8a446ddedade8e", size = 8548242, upload-time = "2025-12-10T07:07:51.568Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/d8/9be608c6024d021041c7f0b3928d4749a706f4e2c3832bbede4fb4f58c95/scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:5025ce924beccb28298246e589c691fe1b8c1c96507e6d27d12c5fadd85bfd76", size = 8079075, upload-time = "2025-12-10T07:07:53.697Z" }, + { url = "https://files.pythonhosted.org/packages/dd/47/f187b4636ff80cc63f21cd40b7b2d177134acaa10f6bb73746130ee8c2e5/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4496bb2cf7a43ce1a2d7524a79e40bc5da45cf598dbf9545b7e8316ccba47bb4", size = 8660492, upload-time = "2025-12-10T07:07:55.574Z" }, + { url = "https://files.pythonhosted.org/packages/97/74/b7a304feb2b49df9fafa9382d4d09061a96ee9a9449a7cbea7988dda0828/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0bcfe4d0d14aec44921545fd2af2338c7471de9cb701f1da4c9d85906ab847a", size = 8931904, upload-time = "2025-12-10T07:07:57.666Z" }, + { url = "https://files.pythonhosted.org/packages/9f/c4/0ab22726a04ede56f689476b760f98f8f46607caecff993017ac1b64aa5d/scikit_learn-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:35c007dedb2ffe38fe3ee7d201ebac4a2deccd2408e8621d53067733e3c74809", size = 8019359, upload-time = "2025-12-10T07:07:59.838Z" }, + { url = "https://files.pythonhosted.org/packages/24/90/344a67811cfd561d7335c1b96ca21455e7e472d281c3c279c4d3f2300236/scikit_learn-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:8c497fff237d7b4e07e9ef1a640887fa4fb765647f86fbe00f969ff6280ce2bb", size = 7641898, upload-time = "2025-12-10T07:08:01.36Z" }, + { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770, upload-time = "2025-12-10T07:08:03.251Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458, upload-time = "2025-12-10T07:08:05.336Z" }, + { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341, upload-time = "2025-12-10T07:08:07.732Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022, upload-time = "2025-12-10T07:08:09.862Z" }, + { url = "https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409, upload-time = "2025-12-10T07:08:12.028Z" }, + { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760, upload-time = "2025-12-10T07:08:13.688Z" }, + { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045, upload-time = "2025-12-10T07:08:15.215Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324, upload-time = "2025-12-10T07:08:17.561Z" }, + { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651, upload-time = "2025-12-10T07:08:19.952Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045, upload-time = "2025-12-10T07:08:22.11Z" }, + { url = "https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994, upload-time = "2025-12-10T07:08:23.943Z" }, + { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518, upload-time = "2025-12-10T07:08:25.71Z" }, + { url = "https://files.pythonhosted.org/packages/24/05/1af2c186174cc92dcab2233f327336058c077d38f6fe2aceb08e6ab4d509/scikit_learn-1.8.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3", size = 8528667, upload-time = "2025-12-10T07:08:27.541Z" }, + { url = 
"https://files.pythonhosted.org/packages/a8/25/01c0af38fe969473fb292bba9dc2b8f9b451f3112ff242c647fee3d0dfe7/scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7", size = 8066524, upload-time = "2025-12-10T07:08:29.822Z" }, + { url = "https://files.pythonhosted.org/packages/be/ce/a0623350aa0b68647333940ee46fe45086c6060ec604874e38e9ab7d8e6c/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6", size = 8657133, upload-time = "2025-12-10T07:08:31.865Z" }, + { url = "https://files.pythonhosted.org/packages/b8/cb/861b41341d6f1245e6ca80b1c1a8c4dfce43255b03df034429089ca2a2c5/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4", size = 8923223, upload-time = "2025-12-10T07:08:34.166Z" }, + { url = "https://files.pythonhosted.org/packages/76/18/a8def8f91b18cd1ba6e05dbe02540168cb24d47e8dcf69e8d00b7da42a08/scikit_learn-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6", size = 8096518, upload-time = "2025-12-10T07:08:36.339Z" }, + { url = "https://files.pythonhosted.org/packages/d1/77/482076a678458307f0deb44e29891d6022617b2a64c840c725495bee343f/scikit_learn-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242", size = 7754546, upload-time = "2025-12-10T07:08:38.128Z" }, + { url = "https://files.pythonhosted.org/packages/2d/d1/ef294ca754826daa043b2a104e59960abfab4cf653891037d19dd5b6f3cf/scikit_learn-1.8.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7", size = 8848305, upload-time = "2025-12-10T07:08:41.013Z" }, + { url = 
"https://files.pythonhosted.org/packages/5b/e2/b1f8b05138ee813b8e1a4149f2f0d289547e60851fd1bb268886915adbda/scikit_learn-1.8.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9", size = 8432257, upload-time = "2025-12-10T07:08:42.873Z" }, + { url = "https://files.pythonhosted.org/packages/26/11/c32b2138a85dcb0c99f6afd13a70a951bfdff8a6ab42d8160522542fb647/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f", size = 8678673, upload-time = "2025-12-10T07:08:45.362Z" }, + { url = "https://files.pythonhosted.org/packages/c7/57/51f2384575bdec454f4fe4e7a919d696c9ebce914590abf3e52d47607ab8/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9", size = 8922467, upload-time = "2025-12-10T07:08:47.408Z" }, + { url = "https://files.pythonhosted.org/packages/35/4d/748c9e2872637a57981a04adc038dacaa16ba8ca887b23e34953f0b3f742/scikit_learn-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2", size = 8774395, upload-time = "2025-12-10T07:08:49.337Z" }, + { url = "https://files.pythonhosted.org/packages/60/22/d7b2ebe4704a5e50790ba089d5c2ae308ab6bb852719e6c3bd4f04c3a363/scikit_learn-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c", size = 8002647, upload-time = "2025-12-10T07:08:51.601Z" }, +] + +[[package]] +name = "scipy" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/56/3e/9cca699f3486ce6bc12ff46dc2031f1ec8eb9ccc9a320fdaf925f1417426/scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e", size = 
30396830, upload-time = "2026-01-10T21:34:23.009Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/11/7241a63e73ba5a516f1930ac8d5b44cbbfabd35ac73a2d08ca206df007c4/scipy-1.17.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:0d5018a57c24cb1dd828bcf51d7b10e65986d549f52ef5adb6b4d1ded3e32a57", size = 31364580, upload-time = "2026-01-10T21:25:25.717Z" }, + { url = "https://files.pythonhosted.org/packages/ed/1d/5057f812d4f6adc91a20a2d6f2ebcdb517fdbc87ae3acc5633c9b97c8ba5/scipy-1.17.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:88c22af9e5d5a4f9e027e26772cc7b5922fab8bcc839edb3ae33de404feebd9e", size = 27969012, upload-time = "2026-01-10T21:25:30.921Z" }, + { url = "https://files.pythonhosted.org/packages/e3/21/f6ec556c1e3b6ec4e088da667d9987bb77cc3ab3026511f427dc8451187d/scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:f3cd947f20fe17013d401b64e857c6b2da83cae567adbb75b9dcba865abc66d8", size = 20140691, upload-time = "2026-01-10T21:25:34.802Z" }, + { url = "https://files.pythonhosted.org/packages/7a/fe/5e5ad04784964ba964a96f16c8d4676aa1b51357199014dce58ab7ec5670/scipy-1.17.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e8c0b331c2c1f531eb51f1b4fc9ba709521a712cce58f1aa627bc007421a5306", size = 22463015, upload-time = "2026-01-10T21:25:39.277Z" }, + { url = "https://files.pythonhosted.org/packages/4a/69/7c347e857224fcaf32a34a05183b9d8a7aca25f8f2d10b8a698b8388561a/scipy-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5194c445d0a1c7a6c1a4a4681b6b7c71baad98ff66d96b949097e7513c9d6742", size = 32724197, upload-time = "2026-01-10T21:25:44.084Z" }, + { url = "https://files.pythonhosted.org/packages/d1/fe/66d73b76d378ba8cc2fe605920c0c75092e3a65ae746e1e767d9d020a75a/scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9eeb9b5f5997f75507814ed9d298ab23f62cf79f5a3ef90031b1ee2506abdb5b", size = 35009148, upload-time = "2026-01-10T21:25:50.591Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/07/07dec27d9dc41c18d8c43c69e9e413431d20c53a0339c388bcf72f353c4b/scipy-1.17.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:40052543f7bbe921df4408f46003d6f01c6af109b9e2c8a66dd1cf6cf57f7d5d", size = 34798766, upload-time = "2026-01-10T21:25:59.41Z" }, + { url = "https://files.pythonhosted.org/packages/81/61/0470810c8a093cdacd4ba7504b8a218fd49ca070d79eca23a615f5d9a0b0/scipy-1.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0cf46c8013fec9d3694dc572f0b54100c28405d55d3e2cb15e2895b25057996e", size = 37405953, upload-time = "2026-01-10T21:26:07.75Z" }, + { url = "https://files.pythonhosted.org/packages/92/ce/672ed546f96d5d41ae78c4b9b02006cedd0b3d6f2bf5bb76ea455c320c28/scipy-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:0937a0b0d8d593a198cededd4c439a0ea216a3f36653901ea1f3e4be949056f8", size = 36328121, upload-time = "2026-01-10T21:26:16.509Z" }, + { url = "https://files.pythonhosted.org/packages/9d/21/38165845392cae67b61843a52c6455d47d0cc2a40dd495c89f4362944654/scipy-1.17.0-cp312-cp312-win_arm64.whl", hash = "sha256:f603d8a5518c7426414d1d8f82e253e454471de682ce5e39c29adb0df1efb86b", size = 24314368, upload-time = "2026-01-10T21:26:23.087Z" }, + { url = "https://files.pythonhosted.org/packages/0c/51/3468fdfd49387ddefee1636f5cf6d03ce603b75205bf439bbf0e62069bfd/scipy-1.17.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:65ec32f3d32dfc48c72df4291345dae4f048749bc8d5203ee0a3f347f96c5ce6", size = 31344101, upload-time = "2026-01-10T21:26:30.25Z" }, + { url = "https://files.pythonhosted.org/packages/b2/9a/9406aec58268d437636069419e6977af953d1e246df941d42d3720b7277b/scipy-1.17.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:1f9586a58039d7229ce77b52f8472c972448cded5736eaf102d5658bbac4c269", size = 27950385, upload-time = "2026-01-10T21:26:36.801Z" }, + { url = 
"https://files.pythonhosted.org/packages/4f/98/e7342709e17afdfd1b26b56ae499ef4939b45a23a00e471dfb5375eea205/scipy-1.17.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:9fad7d3578c877d606b1150135c2639e9de9cecd3705caa37b66862977cc3e72", size = 20122115, upload-time = "2026-01-10T21:26:42.107Z" }, + { url = "https://files.pythonhosted.org/packages/fd/0e/9eeeb5357a64fd157cbe0302c213517c541cc16b8486d82de251f3c68ede/scipy-1.17.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:423ca1f6584fc03936972b5f7c06961670dbba9f234e71676a7c7ccf938a0d61", size = 22442402, upload-time = "2026-01-10T21:26:48.029Z" }, + { url = "https://files.pythonhosted.org/packages/c9/10/be13397a0e434f98e0c79552b2b584ae5bb1c8b2be95db421533bbca5369/scipy-1.17.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe508b5690e9eaaa9467fc047f833af58f1152ae51a0d0aed67aa5801f4dd7d6", size = 32696338, upload-time = "2026-01-10T21:26:55.521Z" }, + { url = "https://files.pythonhosted.org/packages/63/1e/12fbf2a3bb240161651c94bb5cdd0eae5d4e8cc6eaeceb74ab07b12a753d/scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6680f2dfd4f6182e7d6db161344537da644d1cf85cf293f015c60a17ecf08752", size = 34977201, upload-time = "2026-01-10T21:27:03.501Z" }, + { url = "https://files.pythonhosted.org/packages/19/5b/1a63923e23ccd20bd32156d7dd708af5bbde410daa993aa2500c847ab2d2/scipy-1.17.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eec3842ec9ac9de5917899b277428886042a93db0b227ebbe3a333b64ec7643d", size = 34777384, upload-time = "2026-01-10T21:27:11.423Z" }, + { url = "https://files.pythonhosted.org/packages/39/22/b5da95d74edcf81e540e467202a988c50fef41bd2011f46e05f72ba07df6/scipy-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d7425fcafbc09a03731e1bc05581f5fad988e48c6a861f441b7ab729a49a55ea", size = 37379586, upload-time = "2026-01-10T21:27:20.171Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/b6/8ac583d6da79e7b9e520579f03007cb006f063642afd6b2eeb16b890bf93/scipy-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:87b411e42b425b84777718cc41516b8a7e0795abfa8e8e1d573bf0ef014f0812", size = 36287211, upload-time = "2026-01-10T21:28:43.122Z" }, + { url = "https://files.pythonhosted.org/packages/55/fb/7db19e0b3e52f882b420417644ec81dd57eeef1bd1705b6f689d8ff93541/scipy-1.17.0-cp313-cp313-win_arm64.whl", hash = "sha256:357ca001c6e37601066092e7c89cca2f1ce74e2a520ca78d063a6d2201101df2", size = 24312646, upload-time = "2026-01-10T21:28:49.893Z" }, + { url = "https://files.pythonhosted.org/packages/20/b6/7feaa252c21cc7aff335c6c55e1b90ab3e3306da3f048109b8b639b94648/scipy-1.17.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:ec0827aa4d36cb79ff1b81de898e948a51ac0b9b1c43e4a372c0508c38c0f9a3", size = 31693194, upload-time = "2026-01-10T21:27:27.454Z" }, + { url = "https://files.pythonhosted.org/packages/76/bb/bbb392005abce039fb7e672cb78ac7d158700e826b0515cab6b5b60c26fb/scipy-1.17.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:819fc26862b4b3c73a60d486dbb919202f3d6d98c87cf20c223511429f2d1a97", size = 28365415, upload-time = "2026-01-10T21:27:34.26Z" }, + { url = "https://files.pythonhosted.org/packages/37/da/9d33196ecc99fba16a409c691ed464a3a283ac454a34a13a3a57c0d66f3a/scipy-1.17.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:363ad4ae2853d88ebcde3ae6ec46ccca903ea9835ee8ba543f12f575e7b07e4e", size = 20537232, upload-time = "2026-01-10T21:27:40.306Z" }, + { url = "https://files.pythonhosted.org/packages/56/9d/f4b184f6ddb28e9a5caea36a6f98e8ecd2a524f9127354087ce780885d83/scipy-1.17.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:979c3a0ff8e5ba254d45d59ebd38cde48fce4f10b5125c680c7a4bfe177aab07", size = 22791051, upload-time = "2026-01-10T21:27:46.539Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/9d/025cccdd738a72140efc582b1641d0dd4caf2e86c3fb127568dc80444e6e/scipy-1.17.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:130d12926ae34399d157de777472bf82e9061c60cc081372b3118edacafe1d00", size = 32815098, upload-time = "2026-01-10T21:27:54.389Z" }, + { url = "https://files.pythonhosted.org/packages/48/5f/09b879619f8bca15ce392bfc1894bd9c54377e01d1b3f2f3b595a1b4d945/scipy-1.17.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6e886000eb4919eae3a44f035e63f0fd8b651234117e8f6f29bad1cd26e7bc45", size = 35031342, upload-time = "2026-01-10T21:28:03.012Z" }, + { url = "https://files.pythonhosted.org/packages/f2/9a/f0f0a9f0aa079d2f106555b984ff0fbb11a837df280f04f71f056ea9c6e4/scipy-1.17.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:13c4096ac6bc31d706018f06a49abe0485f96499deb82066b94d19b02f664209", size = 34893199, upload-time = "2026-01-10T21:28:10.832Z" }, + { url = "https://files.pythonhosted.org/packages/90/b8/4f0f5cf0c5ea4d7548424e6533e6b17d164f34a6e2fb2e43ffebb6697b06/scipy-1.17.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cacbaddd91fcffde703934897c5cd2c7cb0371fac195d383f4e1f1c5d3f3bd04", size = 37438061, upload-time = "2026-01-10T21:28:19.684Z" }, + { url = "https://files.pythonhosted.org/packages/f9/cc/2bd59140ed3b2fa2882fb15da0a9cb1b5a6443d67cfd0d98d4cec83a57ec/scipy-1.17.0-cp313-cp313t-win_amd64.whl", hash = "sha256:edce1a1cf66298cccdc48a1bdf8fb10a3bf58e8b58d6c3883dd1530e103f87c0", size = 36328593, upload-time = "2026-01-10T21:28:28.007Z" }, + { url = "https://files.pythonhosted.org/packages/13/1b/c87cc44a0d2c7aaf0f003aef2904c3d097b422a96c7e7c07f5efd9073c1b/scipy-1.17.0-cp313-cp313t-win_arm64.whl", hash = "sha256:30509da9dbec1c2ed8f168b8d8aa853bc6723fede1dbc23c7d43a56f5ab72a67", size = 24625083, upload-time = "2026-01-10T21:28:35.188Z" }, + { url = 
"https://files.pythonhosted.org/packages/1a/2d/51006cd369b8e7879e1c630999a19d1fbf6f8b5ed3e33374f29dc87e53b3/scipy-1.17.0-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:c17514d11b78be8f7e6331b983a65a7f5ca1fd037b95e27b280921fe5606286a", size = 31346803, upload-time = "2026-01-10T21:28:57.24Z" }, + { url = "https://files.pythonhosted.org/packages/d6/2e/2349458c3ce445f53a6c93d4386b1c4c5c0c540917304c01222ff95ff317/scipy-1.17.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:4e00562e519c09da34c31685f6acc3aa384d4d50604db0f245c14e1b4488bfa2", size = 27967182, upload-time = "2026-01-10T21:29:04.107Z" }, + { url = "https://files.pythonhosted.org/packages/5e/7c/df525fbfa77b878d1cfe625249529514dc02f4fd5f45f0f6295676a76528/scipy-1.17.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f7df7941d71314e60a481e02d5ebcb3f0185b8d799c70d03d8258f6c80f3d467", size = 20139125, upload-time = "2026-01-10T21:29:10.179Z" }, + { url = "https://files.pythonhosted.org/packages/33/11/fcf9d43a7ed1234d31765ec643b0515a85a30b58eddccc5d5a4d12b5f194/scipy-1.17.0-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:aabf057c632798832f071a8dde013c2e26284043934f53b00489f1773b33527e", size = 22443554, upload-time = "2026-01-10T21:29:15.888Z" }, + { url = "https://files.pythonhosted.org/packages/80/5c/ea5d239cda2dd3d31399424967a24d556cf409fbea7b5b21412b0fd0a44f/scipy-1.17.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a38c3337e00be6fd8a95b4ed66b5d988bac4ec888fd922c2ea9fe5fb1603dd67", size = 32757834, upload-time = "2026-01-10T21:29:23.406Z" }, + { url = "https://files.pythonhosted.org/packages/b8/7e/8c917cc573310e5dc91cbeead76f1b600d3fb17cf0969db02c9cf92e3cfa/scipy-1.17.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00fb5f8ec8398ad90215008d8b6009c9db9fa924fd4c7d6be307c6f945f9cd73", size = 34995775, upload-time = "2026-01-10T21:29:31.915Z" }, + { url = 
"https://files.pythonhosted.org/packages/c5/43/176c0c3c07b3f7df324e7cdd933d3e2c4898ca202b090bd5ba122f9fe270/scipy-1.17.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f2a4942b0f5f7c23c7cd641a0ca1955e2ae83dedcff537e3a0259096635e186b", size = 34841240, upload-time = "2026-01-10T21:29:39.995Z" }, + { url = "https://files.pythonhosted.org/packages/44/8c/d1f5f4b491160592e7f084d997de53a8e896a3ac01cd07e59f43ca222744/scipy-1.17.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:dbf133ced83889583156566d2bdf7a07ff89228fe0c0cb727f777de92092ec6b", size = 37394463, upload-time = "2026-01-10T21:29:48.723Z" }, + { url = "https://files.pythonhosted.org/packages/9f/ec/42a6657f8d2d087e750e9a5dde0b481fd135657f09eaf1cf5688bb23c338/scipy-1.17.0-cp314-cp314-win_amd64.whl", hash = "sha256:3625c631a7acd7cfd929e4e31d2582cf00f42fcf06011f59281271746d77e061", size = 37053015, upload-time = "2026-01-10T21:30:51.418Z" }, + { url = "https://files.pythonhosted.org/packages/27/58/6b89a6afd132787d89a362d443a7bddd511b8f41336a1ae47f9e4f000dc4/scipy-1.17.0-cp314-cp314-win_arm64.whl", hash = "sha256:9244608d27eafe02b20558523ba57f15c689357c85bdcfe920b1828750aa26eb", size = 24951312, upload-time = "2026-01-10T21:30:56.771Z" }, + { url = "https://files.pythonhosted.org/packages/e9/01/f58916b9d9ae0112b86d7c3b10b9e685625ce6e8248df139d0fcb17f7397/scipy-1.17.0-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:2b531f57e09c946f56ad0b4a3b2abee778789097871fc541e267d2eca081cff1", size = 31706502, upload-time = "2026-01-10T21:29:56.326Z" }, + { url = "https://files.pythonhosted.org/packages/59/8e/2912a87f94a7d1f8b38aabc0faf74b82d3b6c9e22be991c49979f0eceed8/scipy-1.17.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:13e861634a2c480bd237deb69333ac79ea1941b94568d4b0efa5db5e263d4fd1", size = 28380854, upload-time = "2026-01-10T21:30:01.554Z" }, + { url = 
"https://files.pythonhosted.org/packages/bd/1c/874137a52dddab7d5d595c1887089a2125d27d0601fce8c0026a24a92a0b/scipy-1.17.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:eb2651271135154aa24f6481cbae5cc8af1f0dd46e6533fb7b56aa9727b6a232", size = 20552752, upload-time = "2026-01-10T21:30:05.93Z" }, + { url = "https://files.pythonhosted.org/packages/3f/f0/7518d171cb735f6400f4576cf70f756d5b419a07fe1867da34e2c2c9c11b/scipy-1.17.0-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:c5e8647f60679790c2f5c76be17e2e9247dc6b98ad0d3b065861e082c56e078d", size = 22803972, upload-time = "2026-01-10T21:30:10.651Z" }, + { url = "https://files.pythonhosted.org/packages/7c/74/3498563a2c619e8a3ebb4d75457486c249b19b5b04a30600dfd9af06bea5/scipy-1.17.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5fb10d17e649e1446410895639f3385fd2bf4c3c7dfc9bea937bddcbc3d7b9ba", size = 32829770, upload-time = "2026-01-10T21:30:16.359Z" }, + { url = "https://files.pythonhosted.org/packages/48/d1/7b50cedd8c6c9d6f706b4b36fa8544d829c712a75e370f763b318e9638c1/scipy-1.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8547e7c57f932e7354a2319fab613981cde910631979f74c9b542bb167a8b9db", size = 35051093, upload-time = "2026-01-10T21:30:22.987Z" }, + { url = "https://files.pythonhosted.org/packages/e2/82/a2d684dfddb87ba1b3ea325df7c3293496ee9accb3a19abe9429bce94755/scipy-1.17.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:33af70d040e8af9d5e7a38b5ed3b772adddd281e3062ff23fec49e49681c38cf", size = 34909905, upload-time = "2026-01-10T21:30:28.704Z" }, + { url = "https://files.pythonhosted.org/packages/ef/5e/e565bd73991d42023eb82bb99e51c5b3d9e2c588ca9d4b3e2cc1d3ca62a6/scipy-1.17.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb55bb97d00f8b7ab95cb64f873eb0bf54d9446264d9f3609130381233483f", size = 37457743, upload-time = "2026-01-10T21:30:34.819Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/a8/a66a75c3d8f1fb2b83f66007d6455a06a6f6cf5618c3dc35bc9b69dd096e/scipy-1.17.0-cp314-cp314t-win_amd64.whl", hash = "sha256:1ff269abf702f6c7e67a4b7aad981d42871a11b9dd83c58d2d2ea624efbd1088", size = 37098574, upload-time = "2026-01-10T21:30:40.782Z" }, + { url = "https://files.pythonhosted.org/packages/56/a5/df8f46ef7da168f1bc52cd86e09a9de5c6f19cc1da04454d51b7d4f43408/scipy-1.17.0-cp314-cp314t-win_arm64.whl", hash = "sha256:031121914e295d9791319a1875444d55079885bbae5bdc9c5e0f2ee5f09d34ff", size = 25246266, upload-time = "2026-01-10T21:30:45.923Z" }, +] + [[package]] name = "six" version = "1.17.0" @@ -1068,6 +1186,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/45/a132b9074aa18e799b891b91ad72133c98d8042c70f6240e4c5f9dabee2f/structlog-25.5.0-py3-none-any.whl", hash = "sha256:a8453e9b9e636ec59bd9e79bbd4a72f025981b3ba0f5837aebf48f02f37a7f9f", size = 72510, upload-time = "2025-10-27T08:28:21.535Z" }, ] +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + [[package]] name = "types-pytz" version = "2025.2.0.20251108" From 1d411f9ebd43e11b7bcba4525ba75cba7903dfbe Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 01:51:53 +0000 Subject: [PATCH 5/5] fix(forecasting): add security validations and fix documentation Security improvements: - Add 
constructor validation for season_length >= 1 in SeasonalNaiveForecaster - Add constructor validation for window_size >= 1 in MovingAverageForecaster - Add path traversal prevention in ForecastingService.predict() - Validate .joblib extension and artifacts directory containment - Log rejection reasons for security auditing Test improvements: - Fix get_settings patching to wrap ForecastingService construction - Add tests for constructor validation - Add tests for path traversal and extension validation Documentation fixes: - Fix config parameter names in ARCHITECTURE.md (season_length, window_size) - Fix README example to use season_length instead of seasonal_period - Fix markdown issues in PRP-5 (code fences, ATX headings) Co-Authored-By: Claude Opus 4.5 --- PRPs/PRP-5-forecasting.md | 40 ++-- README.md | 2 +- app/features/forecasting/models.py | 20 +- app/features/forecasting/persistence.py | 25 +- app/features/forecasting/service.py | 37 ++- app/features/forecasting/tests/test_models.py | 16 ++ .../forecasting/tests/test_service.py | 223 ++++++++++++------ docs/ARCHITECTURE.md | 4 +- 8 files changed, 269 insertions(+), 98 deletions(-) diff --git a/PRPs/PRP-5-forecasting.md b/PRPs/PRP-5-forecasting.md index 5f5868cb..6759f5b8 100644 --- a/PRPs/PRP-5-forecasting.md +++ b/PRPs/PRP-5-forecasting.md @@ -101,7 +101,7 @@ Implement a model zoo with baseline forecasting algorithms and a unified model i ### Current Codebase Tree (Relevant Parts) -``` +```text app/ ├── core/ │ ├── config.py # Settings singleton @@ -124,7 +124,7 @@ app/ ### Desired Codebase Tree -``` +```text app/features/forecasting/ # NEW: Forecasting vertical slice ├── __init__.py # Module exports ├── schemas.py # ModelConfig, TrainRequest, PredictRequest, PredictResponse @@ -153,27 +153,33 @@ app/main.py # MODIFY: Register forecasting router ### Known Gotchas -```python -# CRITICAL: Pydantic v2 uses model_config = ConfigDict(...), not class Config -# Example: frozen=True for immutability, extra="forbid" 
for strict validation +#### CRITICAL: Pydantic v2 uses model_config = ConfigDict(...), not class Config -# CRITICAL: Use field_validator (not @validator) with @classmethod decorator -# Example: @field_validator("horizon") @classmethod def validate_horizon(...) +Example: `frozen=True` for immutability, `extra="forbid"` for strict validation. -# CRITICAL: Joblib serialization includes Python version - document this -# Models may not load if trained on different Python/sklearn version +#### CRITICAL: Use field_validator (not @validator) with @classmethod decorator -# CRITICAL: Recursive forecasting propagates errors - warn users for long horizons +Example: `@field_validator("horizon") @classmethod def validate_horizon(...)` -# CRITICAL: All forecasters must be deterministic with fixed random_state -# Use Settings().forecast_random_seed consistently +#### CRITICAL: Joblib serialization includes Python version -# CRITICAL: Multi-horizon forecasting updates lags recursively -# Prediction at t+1 becomes lag_1 for prediction at t+2 +Models may not load if trained on different Python/sklearn version. Document this. -# CRITICAL: Feature engineering must use cutoff_date = last training date -# Never use future data when computing features for prediction -``` +#### CRITICAL: Recursive forecasting propagates errors + +Warn users for long horizons. + +#### CRITICAL: All forecasters must be deterministic with fixed random_state + +Use `Settings().forecast_random_seed` consistently. + +#### CRITICAL: Multi-horizon forecasting updates lags recursively + +Prediction at t+1 becomes lag_1 for prediction at t+2. + +#### CRITICAL: Feature engineering must use cutoff_date = last training date + +Never use future data when computing features for prediction. 
--- diff --git a/README.md b/README.md index cc1dd062..de8a8e96 100644 --- a/README.md +++ b/README.md @@ -202,7 +202,7 @@ curl -X POST http://localhost:8123/forecasting/train \ "train_end_date": "2024-06-30", "config": { "model_type": "seasonal_naive", - "seasonal_period": 7 + "season_length": 7 } }' ``` diff --git a/app/features/forecasting/models.py b/app/features/forecasting/models.py index 0727642b..04f9dc05 100644 --- a/app/features/forecasting/models.py +++ b/app/features/forecasting/models.py @@ -235,10 +235,18 @@ def __init__(self, season_length: int = 7, random_state: int = 42) -> None: """Initialize the seasonal naive forecaster. Args: - season_length: Seasonality period in days. + season_length: Seasonality period in days (must be >= 1). random_state: Random seed for reproducibility (unused but kept for interface). + + Raises: + ValueError: If season_length < 1. """ super().__init__(random_state) + if season_length < 1: + raise ValueError( + f"season_length must be >= 1, got {season_length}. " + "A valid seasonality period is required for seasonal forecasting." + ) self.season_length = season_length def fit( @@ -330,10 +338,18 @@ def __init__(self, window_size: int = 7, random_state: int = 42) -> None: """Initialize the moving average forecaster. Args: - window_size: Window size for averaging. + window_size: Window size for averaging (must be >= 1). random_state: Random seed for reproducibility (unused but kept for interface). + + Raises: + ValueError: If window_size < 1. """ super().__init__(random_state) + if window_size < 1: + raise ValueError( + f"window_size must be >= 1, got {window_size}. " + "A valid window size is required for moving average computation." 
+ ) self.window_size = window_size self._forecast_value: float = 0.0 diff --git a/app/features/forecasting/persistence.py b/app/features/forecasting/persistence.py index d0a435c7..9f7a9563 100644 --- a/app/features/forecasting/persistence.py +++ b/app/features/forecasting/persistence.py @@ -111,21 +111,42 @@ def save_model_bundle(bundle: ModelBundle, path: str | Path) -> Path: return path -def load_model_bundle(path: str | Path) -> ModelBundle: +def load_model_bundle(path: str | Path, base_dir: str | Path | None = None) -> ModelBundle: """Load model bundle from disk. CRITICAL: Logs warning if versions don't match. + SECURITY: Validates path is within allowed base directory to prevent path traversal. Args: path: Path to saved bundle. + base_dir: Optional base directory for path validation. If provided, + the resolved path must be within this directory. Returns: Loaded ModelBundle. Raises: FileNotFoundError: If path doesn't exist. + ValueError: If path is outside the allowed base directory. """ - path = Path(path) + path = Path(path).resolve() + + # Security: validate path is within allowed base directory + if base_dir is not None: + base_path = Path(base_dir).resolve() + try: + path.relative_to(base_path) + except ValueError: + logger.warning( + "forecasting.model_load_rejected", + path=str(path), + base_dir=str(base_path), + reason="path_outside_allowed_directory", + ) + raise ValueError( + f"Model path '{path}' is outside the allowed artifacts directory '{base_path}'. " + "Only model artifacts within the configured directory can be loaded." 
+ ) from None if not path.exists(): raise FileNotFoundError(f"Model bundle not found: {path}") diff --git a/app/features/forecasting/service.py b/app/features/forecasting/service.py index 9e4a6e18..a1cf03fa 100644 --- a/app/features/forecasting/service.py +++ b/app/features/forecasting/service.py @@ -215,8 +215,41 @@ async def predict( model_path=model_path, ) - # Load model bundle - bundle = load_model_bundle(model_path) + # Security: Validate model_path before loading + # Resolve to absolute path and validate extension and location + resolved_path = Path(model_path).resolve() + artifacts_dir = Path(self.settings.forecast_model_artifacts_dir).resolve() + + # Check for .joblib extension + if resolved_path.suffix != ".joblib": + logger.warning( + "forecasting.predict_rejected", + model_path=model_path, + resolved_path=str(resolved_path), + reason="invalid_extension", + ) + raise ValueError( + f"Invalid model path: '{model_path}'. Model files must have .joblib extension." + ) + + # Check path is within artifacts directory + try: + resolved_path.relative_to(artifacts_dir) + except ValueError: + logger.warning( + "forecasting.predict_rejected", + model_path=model_path, + resolved_path=str(resolved_path), + artifacts_dir=str(artifacts_dir), + reason="path_traversal_attempt", + ) + raise ValueError( + f"Invalid model path: '{model_path}'. " + f"Model path must be within the configured artifacts directory: '{artifacts_dir}'." 
+ ) from None + + # Load model bundle (path already validated) + bundle = load_model_bundle(resolved_path) # Validate store/product match bundle_store_id = bundle.metadata.get("store_id") diff --git a/app/features/forecasting/tests/test_models.py b/app/features/forecasting/tests/test_models.py index d5a1bfc7..17ae007c 100644 --- a/app/features/forecasting/tests/test_models.py +++ b/app/features/forecasting/tests/test_models.py @@ -117,6 +117,14 @@ def test_insufficient_data_raises(self): with pytest.raises(ValueError, match="at least 7"): model.fit(short_data) + def test_invalid_season_length_raises(self): + """Test that season_length < 1 raises ValueError on construction.""" + with pytest.raises(ValueError, match="season_length must be >= 1"): + SeasonalNaiveForecaster(season_length=0) + + with pytest.raises(ValueError, match="season_length must be >= 1"): + SeasonalNaiveForecaster(season_length=-5) + def test_get_params(self): """Test get_params returns expected values.""" model = SeasonalNaiveForecaster(season_length=14, random_state=42) @@ -173,6 +181,14 @@ def test_insufficient_data_raises(self): with pytest.raises(ValueError, match="at least 7"): model.fit(short_data) + def test_invalid_window_size_raises(self): + """Test that window_size < 1 raises ValueError on construction.""" + with pytest.raises(ValueError, match="window_size must be >= 1"): + MovingAverageForecaster(window_size=0) + + with pytest.raises(ValueError, match="window_size must be >= 1"): + MovingAverageForecaster(window_size=-3) + def test_get_params(self): """Test get_params returns expected values.""" model = MovingAverageForecaster(window_size=14, random_state=42) diff --git a/app/features/forecasting/tests/test_service.py b/app/features/forecasting/tests/test_service.py index fa93e27e..6b476686 100644 --- a/app/features/forecasting/tests/test_service.py +++ b/app/features/forecasting/tests/test_service.py @@ -80,8 +80,8 @@ class TestForecastingServicePredict: """Tests for 
ForecastingService.predict method.""" @pytest.fixture - def saved_model_path(self, sample_naive_config, sample_time_series): - """Create a saved model for prediction tests.""" + def saved_model_context(self, sample_naive_config, sample_time_series): + """Create a saved model for prediction tests with patched settings.""" with TemporaryDirectory() as tmpdir: model = NaiveForecaster() model.fit(sample_time_series) @@ -98,107 +98,184 @@ def saved_model_path(self, sample_naive_config, sample_time_series): path = Path(tmpdir) / "test_model" saved = save_model_bundle(bundle, path) - yield str(saved) + yield {"model_path": str(saved), "tmpdir": tmpdir} @pytest.mark.asyncio - async def test_predict_returns_correct_horizon(self, saved_model_path): + async def test_predict_returns_correct_horizon(self, saved_model_context): """Test that predict returns correct number of forecast points.""" - service = ForecastingService() + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = saved_model_context["tmpdir"] + mock_settings.return_value = settings - response = await service.predict( - store_id=1, - product_id=2, - horizon=7, - model_path=saved_model_path, - ) + service = ForecastingService() - assert len(response.forecasts) == 7 - assert response.horizon == 7 + response = await service.predict( + store_id=1, + product_id=2, + horizon=7, + model_path=saved_model_context["model_path"], + ) + + assert len(response.forecasts) == 7 + assert response.horizon == 7 @pytest.mark.asyncio - async def test_predict_validates_store_id(self, saved_model_path): + async def test_predict_validates_store_id(self, saved_model_context): """Test that predict validates store_id matches model.""" - service = ForecastingService() + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = saved_model_context["tmpdir"] + 
mock_settings.return_value = settings - with pytest.raises(ValueError, match="store=1"): - await service.predict( - store_id=999, # Wrong store - product_id=2, - horizon=7, - model_path=saved_model_path, - ) + service = ForecastingService() + + with pytest.raises(ValueError, match="store=1"): + await service.predict( + store_id=999, # Wrong store + product_id=2, + horizon=7, + model_path=saved_model_context["model_path"], + ) @pytest.mark.asyncio - async def test_predict_validates_product_id(self, saved_model_path): + async def test_predict_validates_product_id(self, saved_model_context): """Test that predict validates product_id matches model.""" - service = ForecastingService() + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = saved_model_context["tmpdir"] + mock_settings.return_value = settings - with pytest.raises(ValueError, match="product=2"): - await service.predict( - store_id=1, - product_id=999, # Wrong product - horizon=7, - model_path=saved_model_path, - ) + service = ForecastingService() + + with pytest.raises(ValueError, match="product=2"): + await service.predict( + store_id=1, + product_id=999, # Wrong product + horizon=7, + model_path=saved_model_context["model_path"], + ) @pytest.mark.asyncio async def test_predict_file_not_found(self): """Test that predict raises for missing model file.""" - service = ForecastingService() + with TemporaryDirectory() as tmpdir: + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = tmpdir + mock_settings.return_value = settings - with pytest.raises(FileNotFoundError): - await service.predict( - store_id=1, - product_id=1, - horizon=7, - model_path="/nonexistent/model.joblib", - ) + service = ForecastingService() + + # Path must be inside artifacts dir and have .joblib extension + with pytest.raises(FileNotFoundError): + await 
service.predict( + store_id=1, + product_id=1, + horizon=7, + model_path=f"{tmpdir}/nonexistent/model.joblib", + ) @pytest.mark.asyncio - async def test_predict_forecast_dates(self, saved_model_path): + async def test_predict_forecast_dates(self, saved_model_context): """Test that predict generates correct forecast dates.""" - service = ForecastingService() + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = saved_model_context["tmpdir"] + mock_settings.return_value = settings - response = await service.predict( - store_id=1, - product_id=2, - horizon=3, - model_path=saved_model_path, - ) + service = ForecastingService() - # Train end date was 2024-01-31, so forecasts start 2024-02-01 - assert response.forecasts[0].date == date(2024, 2, 1) - assert response.forecasts[1].date == date(2024, 2, 2) - assert response.forecasts[2].date == date(2024, 2, 3) + response = await service.predict( + store_id=1, + product_id=2, + horizon=3, + model_path=saved_model_context["model_path"], + ) + + # Train end date was 2024-01-31, so forecasts start 2024-02-01 + assert response.forecasts[0].date == date(2024, 2, 1) + assert response.forecasts[1].date == date(2024, 2, 2) + assert response.forecasts[2].date == date(2024, 2, 3) @pytest.mark.asyncio - async def test_predict_includes_model_type(self, saved_model_path): + async def test_predict_includes_model_type(self, saved_model_context): """Test that predict response includes model type.""" - service = ForecastingService() + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = saved_model_context["tmpdir"] + mock_settings.return_value = settings - response = await service.predict( - store_id=1, - product_id=2, - horizon=7, - model_path=saved_model_path, - ) + service = ForecastingService() + + response = await service.predict( + store_id=1, + 
product_id=2, + horizon=7, + model_path=saved_model_context["model_path"], + ) - assert response.model_type == "naive" + assert response.model_type == "naive" @pytest.mark.asyncio - async def test_predict_includes_config_hash(self, saved_model_path): + async def test_predict_includes_config_hash(self, saved_model_context): """Test that predict response includes config hash.""" - service = ForecastingService() + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = saved_model_context["tmpdir"] + mock_settings.return_value = settings - response = await service.predict( - store_id=1, - product_id=2, - horizon=7, - model_path=saved_model_path, - ) + service = ForecastingService() + + response = await service.predict( + store_id=1, + product_id=2, + horizon=7, + model_path=saved_model_context["model_path"], + ) + + assert response.config_hash is not None + assert len(response.config_hash) == 16 - assert response.config_hash is not None - assert len(response.config_hash) == 16 + @pytest.mark.asyncio + async def test_predict_rejects_path_traversal(self): + """Test that predict rejects paths outside artifacts directory.""" + with TemporaryDirectory() as tmpdir: + with patch("app.features.forecasting.service.get_settings") as mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = tmpdir + mock_settings.return_value = settings + + service = ForecastingService() + + # Try to load from outside the artifacts directory (with valid extension) + with pytest.raises(ValueError, match="must be within the configured"): + await service.predict( + store_id=1, + product_id=1, + horizon=7, + model_path="/etc/malicious.joblib", + ) + + @pytest.mark.asyncio + async def test_predict_rejects_invalid_extension(self): + """Test that predict rejects non-.joblib files.""" + with TemporaryDirectory() as tmpdir: + with patch("app.features.forecasting.service.get_settings") as 
mock_settings: + settings = MagicMock() + settings.forecast_model_artifacts_dir = tmpdir + mock_settings.return_value = settings + + service = ForecastingService() + + # Try to load a file without .joblib extension + with pytest.raises(ValueError, match=r"\.joblib extension"): + await service.predict( + store_id=1, + product_id=1, + horizon=7, + model_path=f"{tmpdir}/model.pkl", + ) class TestForecastingServiceTrain: @@ -228,8 +305,6 @@ async def test_train_empty_data_raises(self): @pytest.mark.asyncio async def test_train_returns_model_path(self): """Test that training returns a valid model path.""" - service = ForecastingService() - # Mock database session with sample data mock_db = AsyncMock() mock_result = MagicMock() @@ -246,12 +321,16 @@ async def test_train_returns_model_path(self): mock_db.execute.return_value = mock_result with TemporaryDirectory() as tmpdir: + # Patch get_settings BEFORE constructing ForecastingService with patch("app.features.forecasting.service.get_settings") as mock_settings: settings = MagicMock() settings.forecast_random_seed = 42 settings.forecast_model_artifacts_dir = tmpdir mock_settings.return_value = settings + # Now construct service with patched settings + service = ForecastingService() + response = await service.train_model( db=mock_db, store_id=1, diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 0e2b5735..9bcd3e72 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -243,8 +243,8 @@ The `FeatureEngineeringService` provides time-safe feature computation with CRIT | Model | Description | Config Parameters | |-------|-------------|-------------------| | `naive` | Last observed value | None | -| `seasonal_naive` | Previous season value | `seasonal_period` (e.g., 7 for weekly) | -| `moving_average` | Mean of last N observations | `window` (default: 7) | +| `seasonal_naive` | Previous season value | `season_length` (e.g., 7 for weekly) | +| `moving_average` | Mean of last N observations | `window_size` 
(default: 7) | | `lightgbm` | LightGBM regressor | Feature-flagged, disabled by default | **Model Interface:** All models inherit from `BaseForecaster` with typed `fit()` and `predict()` methods.