Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
822 changes: 822 additions & 0 deletions PRPs/PRP-3.1A-pydantic-configs-phase2-features.md

Large diffs are not rendered by default.

1,121 changes: 1,121 additions & 0 deletions PRPs/PRP-3.1B-lifecycle-compute.md

Large diffs are not rendered by default.

1,065 changes: 1,065 additions & 0 deletions PRPs/PRP-3.1C-replenishment-compute.md

Large diffs are not rendered by default.

1,219 changes: 1,219 additions & 0 deletions PRPs/PRP-3.1D-promotion-compute.md

Large diffs are not rendered by default.

1,276 changes: 1,276 additions & 0 deletions PRPs/PRP-3.1E-phase2-e2e-integration-and-docs.md

Large diffs are not rendered by default.

115 changes: 114 additions & 1 deletion app/features/featuresets/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,14 @@ class FeatureConfigBase(BaseModel):
def config_hash(self) -> str:
    """Generate deterministic hash of configuration.

    Excludes ``None``-valued fields so that adding new optional sub-config
    slots is hash-invariant for callers that don't set them (additive
    contract — see PRP-3.1A §15 Decision E).

    Returns:
        16-character hex string hash of config JSON.
    """
    # Serialize exactly once, with None-valued fields left out of the JSON
    # so unset optional slots never influence the digest.
    config_json = self.model_dump_json(exclude_none=True)
    # Keep only the first 16 hex characters of the SHA-256 digest.
    return hashlib.sha256(config_json.encode()).hexdigest()[:16]


Expand Down Expand Up @@ -183,6 +187,99 @@ def validate_price_lags_positive(cls, v: tuple[int, ...]) -> tuple[int, ...]:
return v


class LifecycleConfig(FeatureConfigBase):
    """Settings for product-lifecycle features.

    Time-since-launch and time-since-discontinue are emitted as continuous
    integer date-deltas rather than a categorical lifecycle stage; LightGBM
    splits can recover stage boundaries from the continuous value (see
    PRP-3.1 decisions log §1).

    Both deltas derive from ``product.launch_date`` and
    ``product.discontinue_date``, which are nullable on Phase 2 products.

    Attributes:
        include_days_since_launch: Emit ``days_since_launch_lag{N}`` columns.
        include_days_since_discontinue: Emit
            ``days_since_discontinue_lag{N}`` columns.
        lag_days: Lag offset in days, constrained to 1..30 (a minimum of 1
            prevents leakage).
    """

    include_days_since_launch: bool = True
    include_days_since_discontinue: bool = True
    lag_days: int = Field(
        default=1,
        ge=1,
        le=30,
        description="Lag offset in days",
    )


class ReplenishmentConfig(FeatureConfigBase):
    """Settings for replenishment-event features.

    Inbound-stock cadence is described by two feature families:
      * ``days_since_last_replenishment_lag{N}`` -- gap to the previous event
      * ``replenishment_count_w{W}_lag{N}``      -- rolling count over window W

    The data comes from the ``replenishment_event`` table, which is separate
    from sales_daily. This class only declares the contract; the JOIN itself
    lives in service.py (PRP-3.1C).

    Attributes:
        include_days_since_last: Emit ``days_since_last_replenishment_lag{N}``.
        include_count_window: Emit ``replenishment_count_w{W}_lag{N}``.
        lag_days: Lag offset in days, constrained to 1..30.
        count_window_days: Rolling-window size for count features,
            constrained to 7..60.
    """

    include_days_since_last: bool = True
    include_count_window: bool = True
    lag_days: int = Field(
        default=1,
        ge=1,
        le=30,
        description="Lag offset in days",
    )
    count_window_days: int = Field(
        default=14, ge=7, le=60,
        description="Rolling-window size for replenishment count features",
    )


class PromotionConfig(FeatureConfigBase):
    """Settings for generic promotion-family features.

    This generalizes the original MarkdownConfig design (PRP-3.1 decisions
    log §3) so that a single JOIN covers all four ``promotion.kind`` values:
    pct_off | bogo | bundle | markdown. The default
    ``kinds_to_track=("markdown",)`` keeps the original PRD intent; callers
    opt in to the other kinds explicitly.

    Columns produced for each kind listed in ``kinds_to_track``:
      * ``promo_<kind>_active_lag{N}``    -- int 0/1
      * ``promo_<kind>_intensity_lag{N}`` -- float (when include_intensity)

    Intensity is sourced from ``promotion.discount_pct`` (Numeric(5,4),
    0..1 range) per data_platform/models.py; NULL discounts produce NaN
    columns.

    Attributes:
        kinds_to_track: Allow-listed promotion kinds. A tuple is required
            for frozen-model hashability.
        include_active: Emit ``promo_<kind>_active_lag{N}``.
        include_intensity: Emit ``promo_<kind>_intensity_lag{N}``.
        lag_days: Lag offset in days, constrained to 1..30.
    """

    kinds_to_track: tuple[Literal["pct_off", "bogo", "bundle", "markdown"], ...] = Field(
        default=("markdown",),
        description="Promotion kinds to track (subset of promotion.kind allow-list)",
    )
    include_active: bool = True
    include_intensity: bool = True
    lag_days: int = Field(
        default=1,
        ge=1,
        le=30,
        description="Lag offset in days",
    )

    @field_validator("kinds_to_track")
    @classmethod
    def validate_kinds_non_empty_unique(cls, v: tuple[str, ...]) -> tuple[str, ...]:
        """Require at least one kind and forbid repeated kinds.

        Raises:
            ValueError: If the tuple is empty or contains a duplicate.
        """
        if len(v) == 0:
            raise ValueError("At least one promotion kind must be specified")
        distinct_kinds = set(v)
        # A set smaller than the tuple means some kind was listed twice.
        if len(distinct_kinds) < len(v):
            raise ValueError("Duplicate promotion kinds are not allowed")
        return v


class ImputationConfig(FeatureConfigBase):
"""Configuration for missing value imputation.

Expand Down Expand Up @@ -225,6 +322,9 @@ class FeatureSetConfig(FeatureConfigBase):
rolling_config: Configuration for rolling features (None = disabled).
calendar_config: Configuration for calendar features (None = disabled).
exogenous_config: Configuration for exogenous features (None = disabled).
lifecycle_config: Configuration for lifecycle features (None = disabled).
replenishment_config: Configuration for replenishment features (None = disabled).
promotion_config: Configuration for promotion features (None = disabled).
imputation_config: Configuration for imputation (None = disabled).
"""

Expand All @@ -244,6 +344,11 @@ class FeatureSetConfig(FeatureConfigBase):
rolling_config: RollingConfig | None = None
calendar_config: CalendarConfig | None = None
exogenous_config: ExogenousConfig | None = None
# --- Phase 2 additions (PRP-3.1A) ---
lifecycle_config: LifecycleConfig | None = None
replenishment_config: ReplenishmentConfig | None = None
promotion_config: PromotionConfig | None = None
# --- end Phase 2 additions ---
imputation_config: ImputationConfig | None = None

def get_enabled_features(self) -> list[str]:
Expand All @@ -261,6 +366,14 @@ def get_enabled_features(self) -> list[str]:
enabled.append("calendar")
if self.exogenous_config:
enabled.append("exogenous")
# --- Phase 2 additions (PRP-3.1A) ---
if self.lifecycle_config:
enabled.append("lifecycle")
if self.replenishment_config:
enabled.append("replenishment")
if self.promotion_config:
enabled.append("promotion")
# --- end Phase 2 additions ---
return enabled


Expand Down
88 changes: 88 additions & 0 deletions app/features/featuresets/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Test fixtures for featuresets module."""

from datetime import date

import pandas as pd
import pytest

Expand Down Expand Up @@ -169,3 +171,89 @@ def time_series_with_gaps() -> pd.DataFrame:
df.loc[7, "unit_price"] = None

return df


# =============================================================================
# Phase 2 fixtures (PRP-3.1A) — consumed by PRP-3.1B/C/D leakage cases.
# Sequential / derivable values let downstream tests mathematically detect
# any contamination from future-date data into past-date feature rows.
# =============================================================================


@pytest.fixture
def phase2_product_attrs_df() -> pd.DataFrame:
    """Provide Phase 2 product lifecycle attributes.

    Grain: one row per ``product_id``. Mirrors a subset of
    ``app/features/data_platform/models.py:Product`` columns:
      - product_id        (int)
      - launch_date       (date | None)
      - discontinue_date  (date | None)  -- None = still active

    Contents:
      * P1 launched 2023-06-01, discontinued 2025-12-31 (closed lifecycle)
      * P2 launched 2024-03-15, still active (open lifecycle)
    """
    column_names = ("product_id", "launch_date", "discontinue_date")
    product_rows = (
        (1, date(2023, 6, 1), date(2025, 12, 31)),  # P1: closed lifecycle
        (2, date(2024, 3, 15), None),  # P2: open lifecycle
    )
    # Transpose the row tuples into one list per column, in column order.
    per_column = (list(values) for values in zip(*product_rows))
    return pd.DataFrame(dict(zip(column_names, per_column)))


@pytest.fixture
def phase2_replenishment_events_df() -> pd.DataFrame:
    """Provide Phase 2 replenishment events.

    Grain: one row per (store_id, product_id, event_date). Mirrors
    ``app/features/data_platform/models.py:ReplenishmentEvent`` columns:
      - store_id        (int)
      - product_id      (int)
      - event_date      (date)
      - lead_time_days  (int)
      - ordered_qty     (int)
      - received_qty    (int)  -- received_qty <= ordered_qty per CHECK

    All three events belong to (store=1, product=1) and are spaced 7 days
    and then 14 days apart, so PRP-3.1C tests can compute
    ``days_since_last`` (=7, then 14) plus rolling counts over W=14.
    """
    column_names = (
        "store_id",
        "product_id",
        "event_date",
        "lead_time_days",
        "ordered_qty",
        "received_qty",
    )
    event_rows = (
        (1, 1, date(2024, 1, 5), 7, 100, 98),
        (1, 1, date(2024, 1, 12), 5, 100, 100),  # 7 days after the first
        (1, 1, date(2024, 1, 26), 10, 200, 195),  # 14 days after the second
    )
    # Transpose the row tuples into one list per column, in column order.
    per_column = (list(values) for values in zip(*event_rows))
    return pd.DataFrame(dict(zip(column_names, per_column)))


@pytest.fixture
def phase2_promotion_rows_df() -> pd.DataFrame:
    """Provide Phase 2 promotion rows.

    Grain: one row per active campaign. Mirrors a subset of
    ``app/features/data_platform/models.py:Promotion`` columns:
      - product_id    (int)
      - store_id      (int | None)    -- None = chain-wide
      - kind          (Literal["pct_off", "bogo", "bundle", "markdown"])
      - discount_pct  (float | None)  -- NULL for bogo / bundle
      - start_date    (date), end_date (date)

    The rows mix promotion kinds and chain-wide vs store-specific scope to
    exercise PRP-3.1D's per-kind one-hot branch and NULL-discount handling.
    """
    column_names = (
        "product_id",
        "store_id",
        "kind",
        "discount_pct",
        "start_date",
        "end_date",
    )
    # NOTE(review): the None entries in store_id / discount_pct presumably
    # surface as NaN in float columns once pandas infers dtypes -- confirm
    # if a downstream test depends on the exact dtype.
    campaign_rows = (
        (1, 1, "markdown", 0.20, date(2024, 1, 7), date(2024, 1, 14)),
        (1, None, "pct_off", 0.10, date(2024, 1, 1), date(2024, 1, 31)),  # chain-wide
        (2, 1, "bogo", None, date(2024, 1, 15), date(2024, 1, 28)),  # no discount_pct
    )
    # Transpose the row tuples into one list per column, in column order.
    per_column = (list(values) for values in zip(*campaign_rows))
    return pd.DataFrame(dict(zip(column_names, per_column)))
Loading