From 70ebb836ecce9b4103d0519e71abd9a2f0d10e28 Mon Sep 17 00:00:00 2001
From: Gabor Szabo <shellsnake@icloud.com>
Date: Wed, 13 May 2026 04:42:40 +0200
Subject: [PATCH] feat(features): plumb product attrs through FeatureDataLoader
 (#116)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends FeatureDataLoader so the lifecycle feature family lights up at
the HTTP boundary -- closes the last open gap from the PRP-3.1 umbrella
(was a PRP-3.1E §16 Open Question 2 follow-up).

Changes:

* FeatureDataLoader.load_product_attrs(db, product_ids) -- new method
  loading product.id, launch_date, discontinue_date. No cutoff filter
  needed (launch/discontinue are timeless product attributes, not facts).
  Returns an empty DataFrame with the correct columns when no products
  match, mirroring the load_calendar_data / load_replenishment_events
  shape.
* compute_features_for_series() calls load_product_attrs when
  config.lifecycle_config is set and merges onto the sales frame via
  product_id BEFORE constructing the FeatureEngineeringService. The
  merge attaches the same per-product values to every sales row;
  _compute_lifecycle_features derives the per-row delta + shift(N)
  downstream.
* test_phase2_integration.py: PHASE2_EXPECTED_COLUMNS expanded to
  include days_since_launch_lag1 and days_since_discontinue_lag1.
  The seeded product has launch_date=date(2023, 6, 1) and
  discontinue_date=None, so launch resolves to a real integer; the
  discontinue column appears with all-NaN values (header still emitted).
* docs/PHASE/3-FEATURE_ENGINEERING.md: removed the "at the HTTP
  boundary, the current FeatureDataLoader does not yet join..." note
  and replaced with a positive statement about the end-to-end wiring.

Validation:

* uv run ruff check . && uv run ruff format --check . -- clean.
* uv run mypy app/ -- 0 errors.
* uv run pyright app/features/featuresets/ -- 0 errors.
* uv run pytest app/features/featuresets/ -v -m "not integration" --
  106 passed.
* uv run pytest app/features/featuresets/tests/test_phase2_integration.py
  -v -m integration -- 3 passed (now exercising the lifecycle path
  end-to-end against real Postgres).
* uv run pytest app/ -m "not integration" -- 952 passed (no regression
  outside featuresets).
---
 app/features/featuresets/service.py           | 69 ++++++++++++++++++-
 .../tests/test_phase2_integration.py          | 31 +++++----
 docs/PHASE/3-FEATURE_ENGINEERING.md           | 12 ++--
 3 files changed, 91 insertions(+), 21 deletions(-)

diff --git a/app/features/featuresets/service.py b/app/features/featuresets/service.py
index 84e5e7c9..51d8b084 100644
--- a/app/features/featuresets/service.py
+++ b/app/features/featuresets/service.py
@@ -20,7 +20,7 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.core.logging import get_logger
-from app.features.data_platform.models import Calendar, SalesDaily
+from app.features.data_platform.models import Calendar, Product, SalesDaily
 from app.features.featuresets.schemas import FeatureSetConfig
 
 if TYPE_CHECKING:
@@ -983,6 +983,56 @@ async def load_replenishment_events(
             ]
         )
 
+    async def load_product_attrs(
+        self,
+        db: AsyncSession,
+        product_ids: list[int],
+    ) -> pd.DataFrame:
+        """Load product lifecycle attributes for the given product IDs.
+
+        Returns a per-product slice of dimension columns relevant to the
+        lifecycle feature family. ``launch_date`` and ``discontinue_date``
+        are timeless attributes of the product (NOT facts), so there's no
+        cutoff filter -- the values are constant across time and the
+        ``_compute_lifecycle_features`` method derives the per-row delta
+        downstream.
+
+        Args:
+            db: Async database session.
+            product_ids: Product IDs to include.
+
+        Returns:
+            DataFrame with columns ``[product_id, launch_date,
+            discontinue_date]``. Empty DataFrame (with correct columns)
+            when no matching products are found.
+        """
+        stmt = (
+            select(
+                Product.id,
+                Product.launch_date,
+                Product.discontinue_date,
+            )
+            .where(Product.id.in_(product_ids))
+            .order_by(Product.id)
+        )
+
+        result = await db.execute(stmt)
+        rows = result.all()
+
+        if not rows:
+            return pd.DataFrame(columns=["product_id", "launch_date", "discontinue_date"])
+
+        return pd.DataFrame(
+            [
+                {
+                    "product_id": row.id,
+                    "launch_date": row.launch_date,
+                    "discontinue_date": row.discontinue_date,
+                }
+                for row in rows
+            ]
+        )
+
 
 async def compute_features_for_series(
     db: AsyncSession,
@@ -1035,6 +1085,23 @@ async def compute_features_for_series(
                 how="left",
             )
 
+    # Optionally load product attrs and merge for lifecycle features
+    # (#116). ``launch_date`` / ``discontinue_date`` are timeless product
+    # attributes, so no cutoff filter is needed -- the merge attaches
+    # the same per-product values to every sales row, and
+    # ``_compute_lifecycle_features`` derives per-row deltas downstream.
+    if config.lifecycle_config and not df.empty:
+        product_attrs_df = await loader.load_product_attrs(
+            db=db,
+            product_ids=[product_id],
+        )
+        if not product_attrs_df.empty:
+            df = df.merge(
+                product_attrs_df[["product_id", "launch_date", "discontinue_date"]],
+                on="product_id",
+                how="left",
+            )
+
     # Optionally load replenishment events (PRP-3.1C) before constructing
     # the service. SQL-side date filter enforces time-safety.
     events_df: pd.DataFrame | None = None
diff --git a/app/features/featuresets/tests/test_phase2_integration.py b/app/features/featuresets/tests/test_phase2_integration.py
index a13fa7ea..05b6d36d 100644
--- a/app/features/featuresets/tests/test_phase2_integration.py
+++ b/app/features/featuresets/tests/test_phase2_integration.py
@@ -45,22 +45,25 @@
 
 # --- Phase 2 columns expected end-to-end through ``POST /featuresets/compute``.
 #
-# Replenishment + promotion columns appear because:
-#   * ``compute_features_for_series`` (service.py) eagerly loads
-#     replenishment events via ``FeatureDataLoader.load_replenishment_events``.
-#   * ``_compute_promotion_features`` falls back to an empty DataFrame when
-#     no rows are wired through, and still EMITS its columns (with 0 /
-#     NaN values) -- so the column membership assertion holds.
+# All four families are wired through ``compute_features_for_series``:
+#   * Lifecycle (#116): ``FeatureDataLoader.load_product_attrs`` joins
+#     ``product.launch_date`` / ``product.discontinue_date`` onto the
+#     sales frame when ``lifecycle_config`` is set. The seeded product
+#     has ``launch_date=date(2023, 6, 1)`` so ``days_since_launch_lag1``
+#     resolves to a real integer. ``discontinue_date`` is NULL, so
+#     ``days_since_discontinue_lag1`` is emitted with all-NaN values.
+#   * Replenishment: ``FeatureDataLoader.load_replenishment_events``
+#     loads the seeded events.
+#   * Promotion: ``_compute_promotion_features`` falls back to an empty
+#     DataFrame when rows aren't wired through; columns still appear
+#     (with 0/NaN values).
 #
-# Lifecycle columns (``days_since_launch_lag1`` / ``_discontinue_lag1``) are
-# OMITTED from the expected set because the ``FeatureDataLoader`` does not
-# join ``product.launch_date`` / ``product.discontinue_date`` onto the
-# sales frame; per ``service.py::_compute_lifecycle_features`` (silent
-# skip when both source columns are absent), no lifecycle columns reach
-# ``feature_columns`` at the HTTP boundary. Extending the loader to plumb
-# product attrs is OUT OF SCOPE for this slice (PRP-3.1E ships docs +
-# tests only; see PRP-3.1E §16 Open Question 2).
+# ``days_since_discontinue_lag1`` is in the expected set even though
+# the seeded product has ``discontinue_date=None`` -- the compute method
+# emits the column header regardless; the values are all-NaN downstream.
 PHASE2_EXPECTED_COLUMNS: set[str] = {
+    "days_since_launch_lag1",
+    "days_since_discontinue_lag1",
     "days_since_last_replenishment_lag1",
     "replenishment_count_w14_lag1",
     "promo_markdown_active_lag1",
diff --git a/docs/PHASE/3-FEATURE_ENGINEERING.md b/docs/PHASE/3-FEATURE_ENGINEERING.md
index eebae929..8238ae7f 100644
--- a/docs/PHASE/3-FEATURE_ENGINEERING.md
+++ b/docs/PHASE/3-FEATURE_ENGINEERING.md
@@ -245,12 +245,12 @@ LifecycleConfig(
 | `days_since_launch_lag{N}` | int (NaN if `launch_date` is NULL) | `shift(N)` per `(store_id, product_id)` |
 | `days_since_discontinue_lag{N}` | int (NaN if `discontinue_date` is NULL; signed pre-/post-retire) | `shift(N)` per `(store_id, product_id)` |
 
-> Note: at the HTTP boundary, the current `FeatureDataLoader` does not yet
-> join `product.launch_date` / `product.discontinue_date` onto the sales
-> frame, so `_compute_lifecycle_features` silently emits zero lifecycle
-> columns end-to-end (see `service.py`). The compute method itself is fully
-> wired and unit-tested; extending the loader to plumb product attrs is
-> tracked as follow-up work.
+End-to-end at the HTTP boundary, `FeatureDataLoader.load_product_attrs`
+joins `product.launch_date` / `product.discontinue_date` onto the sales
+frame when `lifecycle_config` is set (#116). The attributes are timeless
+(no cutoff filter), so the merge is constant per product and
+`_compute_lifecycle_features` derives the per-row delta + `shift(N)`
+downstream.
 
 ### Replenishment features