From 70ebb836ecce9b4103d0519e71abd9a2f0d10e28 Mon Sep 17 00:00:00 2001 From: Gabor Szabo Date: Wed, 13 May 2026 04:42:40 +0200 Subject: [PATCH] feat(features): plumb product attrs through FeatureDataLoader (#116) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends FeatureDataLoader so the lifecycle feature family lights up at the HTTP boundary -- closes the last open gap from the PRP-3.1 umbrella (was a PRP-3.1E §16 Open Question 2 follow-up). Changes: * FeatureDataLoader.load_product_attrs(db, product_ids) -- new method loading product.id, launch_date, discontinue_date. No cutoff filter needed (launch/discontinue are timeless product attributes, not facts). Returns an empty DataFrame with the correct columns when no products match, mirroring the load_calendar_data / load_replenishment_events shape. * compute_features_for_series() calls load_product_attrs when config.lifecycle_config is set and merges onto the sales frame via product_id BEFORE constructing the FeatureEngineeringService. The merge attaches the same per-product values to every sales row; _compute_lifecycle_features derives the per-row delta + shift(N) downstream. * test_phase2_integration.py: PHASE2_EXPECTED_COLUMNS expanded to include days_since_launch_lag1 and days_since_discontinue_lag1. The seeded product has launch_date=date(2023, 6, 1) and discontinue_date=None, so launch resolves to a real integer; the discontinue column appears with all-NaN values (header still emitted). * docs/PHASE/3-FEATURE_ENGINEERING.md: removed the "at the HTTP boundary, the current FeatureDataLoader does not yet join..." note and replaced with a positive statement about the end-to-end wiring. Validation: * uv run ruff check . && uv run ruff format --check . -- clean. * uv run mypy app/ -- 0 errors. * uv run pyright app/features/featuresets/ -- 0 errors. * uv run pytest app/features/featuresets/ -v -m "not integration" -- 106 passed. * uv run pytest app/features/featuresets/tests/test_phase2_integration.py -v -m integration -- 3 passed (now exercising the lifecycle path end-to-end against real Postgres). * uv run pytest app/ -m "not integration" -- 952 passed (no regression outside featuresets). --- app/features/featuresets/service.py | 69 ++++++++++++++++++- .../tests/test_phase2_integration.py | 31 +++++---- docs/PHASE/3-FEATURE_ENGINEERING.md | 12 ++-- 3 files changed, 91 insertions(+), 21 deletions(-) diff --git a/app/features/featuresets/service.py b/app/features/featuresets/service.py index 84e5e7c9..51d8b084 100644 --- a/app/features/featuresets/service.py +++ b/app/features/featuresets/service.py @@ -20,7 +20,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.core.logging import get_logger -from app.features.data_platform.models import Calendar, SalesDaily +from app.features.data_platform.models import Calendar, Product, SalesDaily from app.features.featuresets.schemas import FeatureSetConfig if TYPE_CHECKING: @@ -983,6 +983,56 @@ async def load_replenishment_events( ] ) + async def load_product_attrs( + self, + db: AsyncSession, + product_ids: list[int], + ) -> pd.DataFrame: + """Load product lifecycle attributes for the given product IDs. + + Returns a per-product slice of dimension columns relevant to the + lifecycle feature family. ``launch_date`` and ``discontinue_date`` + are timeless attributes of the product (NOT facts), so there's no + cutoff filter -- the values are constant across time and the + ``_compute_lifecycle_features`` method derives the per-row delta + downstream. + + Args: + db: Async database session. + product_ids: Product IDs to include. + + Returns: + DataFrame with columns ``[product_id, launch_date, + discontinue_date]``. Empty DataFrame (with correct columns) + when no matching products are found. + """ + stmt = ( + select( + Product.id, + Product.launch_date, + Product.discontinue_date, + ) + .where(Product.id.in_(product_ids)) + .order_by(Product.id) + ) + + result = await db.execute(stmt) + rows = result.all() + + if not rows: + return pd.DataFrame(columns=["product_id", "launch_date", "discontinue_date"]) + + return pd.DataFrame( + [ + { + "product_id": row.id, + "launch_date": row.launch_date, + "discontinue_date": row.discontinue_date, + } + for row in rows + ] + ) + async def compute_features_for_series( db: AsyncSession, @@ -1035,6 +1085,23 @@ async def compute_features_for_series( how="left", ) + # Optionally load product attrs and merge for lifecycle features + # (#116). ``launch_date`` / ``discontinue_date`` are timeless product + # attributes, so no cutoff filter is needed -- the merge attaches + # the same per-product values to every sales row, and + # ``_compute_lifecycle_features`` derives per-row deltas downstream. + if config.lifecycle_config and not df.empty: + product_attrs_df = await loader.load_product_attrs( + db=db, + product_ids=[product_id], + ) + if not product_attrs_df.empty: + df = df.merge( + product_attrs_df[["product_id", "launch_date", "discontinue_date"]], + on="product_id", + how="left", + ) + # Optionally load replenishment events (PRP-3.1C) before constructing # the service. SQL-side date filter enforces time-safety. events_df: pd.DataFrame | None = None diff --git a/app/features/featuresets/tests/test_phase2_integration.py b/app/features/featuresets/tests/test_phase2_integration.py index a13fa7ea..05b6d36d 100644 --- a/app/features/featuresets/tests/test_phase2_integration.py +++ b/app/features/featuresets/tests/test_phase2_integration.py @@ -45,22 +45,25 @@ # --- Phase 2 columns expected end-to-end through ``POST /featuresets/compute``. # -# Replenishment + promotion columns appear because: -# * ``compute_features_for_series`` (service.py) eagerly loads -# replenishment events via ``FeatureDataLoader.load_replenishment_events``. -# * ``_compute_promotion_features`` falls back to an empty DataFrame when -# no rows are wired through, and still EMITS its columns (with 0 / -# NaN values) -- so the column membership assertion holds. +# All four families are wired through ``compute_features_for_series``: +# * Lifecycle (#116): ``FeatureDataLoader.load_product_attrs`` joins +# ``product.launch_date`` / ``product.discontinue_date`` onto the +# sales frame when ``lifecycle_config`` is set. The seeded product +# has ``launch_date=date(2023, 6, 1)`` so ``days_since_launch_lag1`` +# resolves to a real integer. ``discontinue_date`` is NULL, so +# ``days_since_discontinue_lag1`` is emitted with all-NaN values. +# * Replenishment: ``FeatureDataLoader.load_replenishment_events`` +# loads the seeded events. +# * Promotion: ``_compute_promotion_features`` falls back to an empty +# DataFrame when rows aren't wired through; columns still appear +# (with 0/NaN values). # -# Lifecycle columns (``days_since_launch_lag1`` / ``_discontinue_lag1``) are -# OMITTED from the expected set because the ``FeatureDataLoader`` does not -# join ``product.launch_date`` / ``product.discontinue_date`` onto the -# sales frame; per ``service.py::_compute_lifecycle_features`` (silent -# skip when both source columns are absent), no lifecycle columns reach -# ``feature_columns`` at the HTTP boundary. Extending the loader to plumb -# product attrs is OUT OF SCOPE for this slice (PRP-3.1E ships docs + -# tests only; see PRP-3.1E §16 Open Question 2). +# ``days_since_discontinue_lag1`` is in the expected set even though +# the seeded product has ``discontinue_date=None`` -- the compute method +# emits the column header regardless; the values are all-NaN downstream. PHASE2_EXPECTED_COLUMNS: set[str] = { + "days_since_launch_lag1", + "days_since_discontinue_lag1", "days_since_last_replenishment_lag1", "replenishment_count_w14_lag1", "promo_markdown_active_lag1", diff --git a/docs/PHASE/3-FEATURE_ENGINEERING.md b/docs/PHASE/3-FEATURE_ENGINEERING.md index eebae929..8238ae7f 100644 --- a/docs/PHASE/3-FEATURE_ENGINEERING.md +++ b/docs/PHASE/3-FEATURE_ENGINEERING.md @@ -245,12 +245,12 @@ LifecycleConfig( | `days_since_launch_lag{N}` | int (NaN if `launch_date` is NULL) | `shift(N)` per `(store_id, product_id)` | | `days_since_discontinue_lag{N}` | int (NaN if `discontinue_date` is NULL; signed pre-/post-retire) | `shift(N)` per `(store_id, product_id)` | -> Note: at the HTTP boundary, the current `FeatureDataLoader` does not yet -> join `product.launch_date` / `product.discontinue_date` onto the sales -> frame, so `_compute_lifecycle_features` silently emits zero lifecycle -> columns end-to-end (see `service.py`). The compute method itself is fully -> wired and unit-tested; extending the loader to plumb product attrs is -> tracked as follow-up work. +End-to-end at the HTTP boundary, `FeatureDataLoader.load_product_attrs` +joins `product.launch_date` / `product.discontinue_date` onto the sales +frame when `lifecycle_config` is set (#116). The attributes are timeless +(no cutoff filter), so the merge is constant per product and +`_compute_lifecycle_features` derives the per-row delta + `shift(N)` +downstream. ### Replenishment features