From 19cfa4d1a1f28f2b10899ea73d17e40426986958 Mon Sep 17 00:00:00 2001 From: jaayslaughter-cpu Date: Thu, 14 May 2026 19:46:24 -0700 Subject: [PATCH] PR #566: XGBoost v2 training (season weights + feature alignment) + Marcel regression layer + data-driven blend weight updater --- marcel_layer.py | 1068 ++++++++++++------------------------- prop_enrichment_layer.py | 89 +--- scripts/xgb_k_training.py | 875 ++++++++++++++++++------------ update_blend_weights.py | 239 +++++++++ 4 files changed, 1103 insertions(+), 1168 deletions(-) create mode 100644 update_blend_weights.py diff --git a/marcel_layer.py b/marcel_layer.py index 27ba631..7cda026 100644 --- a/marcel_layer.py +++ b/marcel_layer.py @@ -1,787 +1,373 @@ """ marcel_layer.py -=============== -Marcel 3-Year Projection System for PropIQ Analytics Engine. - -Derived from baseball-sims (thomasosbot/baseball-sims) WITHOUT BHQ subscription. -Original algorithm: Tom Tango's "Marcel the Monkey Forecasting System". -Reference implementation: src/features/marcel.py in thomasosbot/baseball-sims. - -Algorithm: - 1. Collect up to 3 prior seasons of player stats from FanGraphs JSON API - 2. Apply year weights: 5 × most-recent + 4 × prior year + 3 × two years back - 3. Regress to league mean: player_weight = weighted_PA / (weighted_PA + regression_PA) - 4. Apply age adjustment: +0.6%/yr improvement under 29, -0.3%/yr decline over 29 - 5. 
Produce projected rates per player for use as confidence modifiers in Layer 1
-
-PropIQ integration (Layer 8a, fires after FanGraphs Layer 6):
-    Batter K% → K Under prop: if projected K% >> league avg → small K Under boost
-    Batter HR/PA → HR/TB Over: if projected HR rate >> league avg → boost
-    Batter wOBA → hits/H+R+RBI: if wOBA >> league avg → hits Over boost
-    Pitcher K% → K Over prop: if projected K% >> league avg → boost
-    Pitcher BB% → ER Under: if projected BB% << league avg → ER Under boost
-    Pitcher HR/9 → ER Under: if projected HR/9 << league avg → ER Under boost
-
-Max adjustment: ±0.018 per prop — subtle refinement layered on top of Layers 1-7.
-Never overrides or replaces; always additive.
-
-Data source:
-    FanGraphs JSON API — https://www.fangraphs.com/api/leaders/major-league/data
-    Public endpoint, no API key required.
-    Fetches 3 prior seasons (e.g. 2023+2024+2025 for 2026 projections).
-
-Cache:
-    /tmp/marcel_{year}_{iso_year}w{iso_week}.json — refreshed weekly.
-    Season-level projections that don't change day to day.
-
-Dependencies:
-    requests (already in project requirements)
-
-Usage:
-    layer = MarcelLayer(projection_year=2026)
-    layer.prefetch()
-    batter_proj = layer.get_batter("Aaron Judge")
-    pitcher_proj = layer.get_pitcher("Spencer Strider")
-    adj = marcel_adjustment("strikeouts", "Over", "pitcher", pitcher_proj)
+================
+Marcel projections for PropIQ — regression-to-mean for early-season props.
+
+THE PROBLEM
+-----------
+In May, pitchers have 5-8 starts. A pitcher with a 35% K-rate through 6 starts
+looks elite, but Marcel regression says his true talent is probably 28-30% K-rate
+because small samples are noisy. The current model uses the raw 2026 stats,
+which are overfit to small samples early in the season.
+
+Marcel is the simplest projection system that works: weighted average of the
+last 3 seasons (5/4/3 weight), then regressed to league mean based on sample
+size. 
It's not fancy but it consistently outperforms raw stats at small samples. + +USAGE +----- +From prop_enrichment_layer.py, after Steamer but before PA model: + + from marcel_layer import get_marcel_k_rate, get_marcel_hit_rate + + # For K props: + if prop_type == "strikeouts": + raw_k_pct = prop.get("sv_k_pct", 22.0) + season_bf = prop.get("season_bf", 0) + marcel_k = get_marcel_k_rate(raw_k_pct, season_bf, + hist_k_pct=prop.get("career_k_pct")) + prop["_marcel_k_pct"] = marcel_k + # Use as opp_lineup_k_pct_proxy input to PA model + + # For hit props: + if prop_type == "hits": + raw_avg = prop.get("sv_xba", 0.250) + season_pa = prop.get("season_pa", 0) + marcel_h = get_marcel_hit_rate(raw_avg, season_pa, + hist_avg=prop.get("career_avg")) + prop["_marcel_hit_rate"] = marcel_h + +WHEN DOES MARCEL MATTER? +------------------------ +Marcel regression is strongest when sample size is small. +Rule of thumb: + - Pitcher BF < 100: Marcel contributes ~60% of the projection + - Pitcher BF < 300: Marcel contributes ~30% + - Pitcher BF > 600: Marcel contributes <10% (current stats dominate) + +In May (roughly BF 80-200 for a full-season starter), Marcel meaningfully +pulls extreme early-season stats toward the mean. 
""" from __future__ import annotations -import json import logging import os -import time -from datetime import datetime, timezone - -import requests +from functools import lru_cache +from typing import Optional logger = logging.getLogger("propiq.marcel") -# --------------------------------------------------------------------------- -# Constants -# --------------------------------------------------------------------------- -# FIX: Updated to 2025 MLB actuals (FanGraphs leaderboards) -_LEAGUE_AVG: dict = { - # Batter rates - "batter_k_pct": 0.228, # FG 2026: 22.8% through game 44 - "batter_bb_pct": 0.083, # FG 2026: 8.3% through game 44 - "batter_hr_pa": 0.033, # FG 2025: elevated power - "batter_woba": 0.309, # FG 2026: .309 through game 44 - "batter_iso": 0.156, # FG 2025: elevated power (was 0.158) - # Pitcher rates (rates *allowed*) - "pitcher_k_pct": 0.228, # FG 2026: 22.8% through game 44 - "pitcher_bb_pct": 0.087, # FG 2025: 8.4% (confirmed) - "pitcher_hr9": 1.28, # FG 2025: ~1.28 HR/9 (was 1.30) +# ── League average rates (2026 baseline — update seasonally) ────────────────── +# These are the regression targets. Extreme early-season stats get pulled +# toward these values proportional to how much data we have. +LEAGUE_AVG = { + "k_pct": 22.8, # pitcher K% (strikeouts per PA × 100) + "bb_pct": 8.3, # pitcher BB% + "era": 4.25, # ERA + "xera": 4.20, # xERA + "whiff_pct": 24.1, # SwStr% + "hit_rate": 0.248, # batter batting average (raw) + "xba": 0.245, # batter xBA + "xwoba": 0.318, # batter xwOBA + "k_pct_bat": 22.5, # batter K% + "bb_pct_bat": 8.5, # batter BB% } -_FG_BASE_URL = "https://www.fangraphs.com/api/leaders/major-league/data" -_HEADERS = {"User-Agent": "PropIQ/1.0 (analytics)"} -_TIMEOUT = 20 # seconds for FanGraphs HTTP request -_REQUEST_DELAY = 1.5 # pause between batter/pitcher fetches +# Marcel regression weights — how many "league average" PA/BF to mix in +# Lower = faster regression (more conservative). Based on Tango Tiger Marcel paper. 
+# These values are for MLB props specifically, slightly more conservative than +# traditional Marcel for game prediction. +REGRESSION_PA = { + "k_pct": 250, # pitcher K-rate stabilises ~250 BF + "bb_pct": 700, # pitcher BB-rate stabilises ~700 BF + "hit_rate": 600, # batter batting average stabilises ~600 PA + "xba": 200, # xBA stabilises faster (underlying contact quality) + "xwoba": 250, # xwOBA stabilises ~250 PA + "whiff_pct":200, # SwStr% stabilises ~200 pitches seen + "k_pct_bat":150, # batter K-rate stabilises quickly +} -def _scraperapi_get_marcel(url: str, params: dict, headers: dict, timeout: int = 30): - """GET with automatic ScraperAPI fallback on 403/429 from FanGraphs. - Marcel projections require 3 seasons of FanGraphs data — all are 403-blocked - on Railway. ScraperAPI residential proxy bypasses the Cloudflare block. +def _regress(observed: float, sample_n: int, league_avg: float, + regression_n: int) -> float: """ - resp = requests.get(url, params=params, headers=headers, timeout=timeout) - if resp.status_code in (403, 429, 407): - scraper_key = os.getenv("SCRAPERAPI_KEY", "") - if scraper_key: - proxy = f"http://scraperapi:{scraper_key}@proxy-server.scraperapi.com:8001" - proxies = {"http": proxy, "https": proxy} - logger.info( - "[Marcel] Direct fetch %d — retrying via ScraperAPI residential proxy", - resp.status_code, - ) - try: - resp = requests.get( - url, params=params, headers=headers, - timeout=60, proxies=proxies, verify=False, - ) - except Exception as _proxy_err: - logger.warning("[Marcel] ScraperAPI proxy failed: %s", _proxy_err) - else: - logger.warning( - "[Marcel] Got %d from FanGraphs — SCRAPERAPI_KEY not set. 
" - "Marcel projections will fall back to statsapi baseline.", - resp.status_code, - ) - return resp - -_AGE_PEAK = 27 # peak age from Marcel spec -_AGE_YOUNG_RATE = 0.006 # +0.6%/yr improvement under 27 -_AGE_OLD_RATE = 0.003 # -0.3%/yr decline over 27 - -_BATTER_REGRESSION_PA = 200 # Marcel spec: regress batters at 200 PA -_PITCHER_REGRESSION_BF = 250 # Marcel spec: regress pitchers at 250 BF - -_MARCEL_WEIGHTS = [5, 4, 3] # most-recent year first -# Cache helpers -# --------------------------------------------------------------------------- - -def _get_cache_path(year: int) -> str: - """Weekly cache file path in /tmp — refreshed on Monday of each new week.""" - today = datetime.now(timezone.utc) - iso = today.isocalendar() - return f"/tmp/marcel_{year}_{iso.year}w{iso.week}.json" + Marcel regression formula: + weight_observed = sample_n / (sample_n + regression_n) + weight_league = regression_n / (sample_n + regression_n) + result = weight_observed × observed + weight_league × league_avg + As sample_n → infinity, result → observed. + As sample_n → 0, result → league_avg. + """ + if sample_n <= 0: + return league_avg + w_obs = sample_n / (sample_n + regression_n) + w_lg = 1.0 - w_obs + return round(w_obs * observed + w_lg * league_avg, 4) -# --------------------------------------------------------------------------- -# Parsing helpers -# --------------------------------------------------------------------------- -def _parse_pct(val) -> float: +def _weighted_hist(current: float, hist: Optional[float], + weights=(5, 4, 3)) -> float: """ - Parse FanGraphs percentage field. - Handles both string format ("22.0 %") and decimal float (0.22 or 22.0). - Returns a decimal fraction (0.22, not 22). + Three-year weighted average (current season × 5, prev × 4, prev-prev × 3). + Uses available data — if hist not provided, current season dominates. 
""" - if val is None: - return 0.0 - if isinstance(val, (int, float)): - v = float(val) - return v / 100.0 if v > 1.0 else v - s = str(val).strip().rstrip("%").strip() - try: - v = float(s) - return v / 100.0 if v > 1.0 else v - except ValueError: - return 0.0 - - -def _parse_float(val, default: float = 0.0) -> float: - """Safe float parse from any type.""" - if val is None: - return default - try: - return float(val) - except (ValueError, TypeError): - return default - - -# --------------------------------------------------------------------------- -# FanGraphs data fetcher -# --------------------------------------------------------------------------- - -def _fetch_fg_data(stats: str, season_start: int, season_end: int) -> list[dict]: - """ - Fetch multi-year leaderboard from FanGraphs JSON API. + if hist is None: + return current + # hist is a single prior-season value (could represent 1 or 2 seasons) + total_w = weights[0] + weights[1] + return (weights[0] * current + weights[1] * hist) / total_w - stats : "bat" for batters, "pit" for pitchers - season_start : earliest season (inclusive), e.g. 2023 - season_end : most recent season (inclusive), e.g. 2025 - ind=1 : return individual season rows (not combined career total) - type=8 : advanced stats panel (wRC+, wOBA, ISO, K%, BB%, etc.) - Returns raw list of row dicts. 
- """ - params = { - "age": "", - "pos": "all", - "stats": stats, - "lg": "all", - "qual": "0", # all players regardless of PA minimum - "season": str(season_end), - "season1": str(season_start), - "ind": "1", # individual seasons - "type": "8", # advanced stats - } - try: - resp = _scraperapi_get_marcel( - _FG_BASE_URL, params=params, headers=_HEADERS, timeout=_TIMEOUT - ) - if resp.status_code != 200: - logger.warning( - "[Marcel] FanGraphs HTTP %d (stats=%s, %d-%d)", - resp.status_code, stats, season_start, season_end, - ) - return [] - data = resp.json() - rows = data if isinstance(data, list) else data.get("data", []) - logger.info( - "[Marcel] FanGraphs %s: %d rows (%d-%d)", - stats, len(rows), season_start, season_end, - ) - return rows - except Exception as exc: - logger.warning( - "[Marcel] FanGraphs fetch failed (stats=%s): %s", stats, exc - ) - return [] - - -# --------------------------------------------------------------------------- -# Age adjustment (from Marcel spec via baseball-sims) -# --------------------------------------------------------------------------- - -def _age_multiplier(age: int | None) -> float: - """ - Marcel age multiplier for batter performance rates. - Pitchers invert this (age hurts rates allowed differently — caller handles). 
- """ - if age is None: - return 1.0 - if age < _AGE_PEAK: - return 1.0 + _AGE_YOUNG_RATE * (_AGE_PEAK - age) - elif age > _AGE_PEAK: - return 1.0 - _AGE_OLD_RATE * (age - _AGE_PEAK) - return 1.0 - - -# --------------------------------------------------------------------------- -# Marcel rate computation (core formula) -# --------------------------------------------------------------------------- - -def _marcel_rate( - data_by_year: dict[int, tuple[float, float]], # {year: (stat_value, pa_weight)} - league_avg: float, - regression_pa: float, - age_mult: float, +# ── Public API ───────────────────────────────────────────────────────────────── + +def get_marcel_k_rate( + current_k_pct: float, + season_bf: int, + hist_k_pct: Optional[float] = None, ) -> float: """ - Compute Marcel projected rate for a single statistic. + Marcel-projected pitcher K-rate (percentage points, 0-100 scale). - Steps: - 1. Weight 3 most-recent seasons: 5/4/3 (most recent first) - 2. Regress to league mean: player gets more credit with more PA - 3. Apply age adjustment multiplier + Args: + current_k_pct: Current 2026 K% (0-100) + season_bf: Batters faced so far in 2026 + hist_k_pct: Prior-season K% if available (0-100) - Returns the projected rate (e.g. 0.245 for K%). - """ - years = sorted(data_by_year.keys(), reverse=True)[:3] # most recent first + Returns: + Marcel-regressed K% (0-100), between current and league average. 
-    weighted_sum = 0.0
-    weighted_pa = 0.0
-    for i, yr in enumerate(years):
-        stat_val, pa = data_by_year[yr]
-        w = _MARCEL_WEIGHTS[i]
-        weighted_sum += stat_val * pa * w
-        weighted_pa += pa * w
+    Examples:
+        # Elite early-season (35% K-rate, only 80 BF)
+        get_marcel_k_rate(35.0, 80) → ~25.8% (heavy regression)
 
-    if weighted_pa == 0:
-        return league_avg
+        # Elite full-season (28% K-rate, 600 BF)
+        get_marcel_k_rate(28.0, 600) → ~26.5% (light regression)
 
-    raw_rate = weighted_sum / weighted_pa
-
-    # Bayesian regression toward league mean
-    player_weight = weighted_pa / (weighted_pa + regression_pa)
-    regressed = raw_rate * player_weight + league_avg * (1.0 - player_weight)
+        # League-average pitcher (22% K-rate, 200 BF)
+        get_marcel_k_rate(22.0, 200) → ~22.4% (nearly no change)
+    """
+    # Step 1: blend with prior season if available
+    blended = _weighted_hist(current_k_pct, hist_k_pct)
 
-    return max(0.0, regressed * age_mult)
+    # Step 2: regress to league mean based on sample size
+    regressed = _regress(
+        observed    = blended,
+        sample_n    = season_bf,
+        league_avg  = LEAGUE_AVG["k_pct"],
+        regression_n= REGRESSION_PA["k_pct"],
+    )
+    return max(8.0, min(45.0, regressed))
 
 
-# ---------------------------------------------------------------------------
-# Projection builders
-# ---------------------------------------------------------------------------
-
-def _build_batter_projections(
-    rows: list[dict], projection_year: int
-) -> dict[str, dict]:
+def get_marcel_hit_rate(
+    current_avg: float,
+    season_pa: int,
+    hist_avg: Optional[float] = None,
+) -> float:
+    """
+    Marcel-projected batter hit rate (batting average scale, 0-1). 
- Returns {player_name_lower: {k_pct, bb_pct, hr_pa, woba, iso, weighted_pa, age}} - """ - by_player: dict[str, list[dict]] = {} - for row in rows: - name = str( - row.get("PlayerName") or row.get("Name") or "" - ).strip() - if not name: - continue - by_player.setdefault(name.lower(), []).append(row) - - projections: dict[str, dict] = {} - - for name_lower, player_rows in by_player.items(): - k_data: dict[int, tuple[float, float]] = {} - bb_data: dict[int, tuple[float, float]] = {} - hr_data: dict[int, tuple[float, float]] = {} - woba_data: dict[int, tuple[float, float]] = {} - iso_data: dict[int, tuple[float, float]] = {} - - latest_age: int | None = None - latest_year: int = 0 - - for row in player_rows: - season = int(row.get("Season") or 0) - if not season: - continue - pa = _parse_float(row.get("PA") or row.get("TPA"), 0.0) - if pa < 10: - continue # too few PA to be meaningful - - k_pct = _parse_pct(row.get("K%")) - bb_pct = _parse_pct(row.get("BB%")) - hr = _parse_float(row.get("HR"), 0.0) - hr_pa = hr / pa if pa > 0 else 0.0 - woba = _parse_float(row.get("wOBA"), 0.0) - iso = _parse_float(row.get("ISO"), 0.0) - age = _parse_float(row.get("Age"), 0.0) - - k_data[season] = (k_pct, pa) - bb_data[season] = (bb_pct, pa) - hr_data[season] = (hr_pa, pa) - woba_data[season] = (woba, pa) - iso_data[season] = (iso, pa) - - if season > latest_year and age > 0: - latest_year = season - latest_age = int(age) - - if not k_data: - continue - - # Project age to current year - proj_age = ( - latest_age + (projection_year - latest_year) - if latest_age and latest_year else None - ) - age_mult = _age_multiplier(proj_age) - - # Confidence-weighted PA (for potential downstream use) - years = sorted(k_data.keys(), reverse=True)[:3] - num_weights = len(years) - weighted_pa = ( - sum(k_data[yr][1] * _MARCEL_WEIGHTS[i] for i, yr in enumerate(years)) - / sum(_MARCEL_WEIGHTS[:num_weights]) - ) - - projections[name_lower] = { - "k_pct": round(_marcel_rate(k_data, 
_LEAGUE_AVG["batter_k_pct"], _BATTER_REGRESSION_PA, 1.0), 4), - "bb_pct": round(_marcel_rate(bb_data, _LEAGUE_AVG["batter_bb_pct"], _BATTER_REGRESSION_PA, 1.0), 4), - "hr_pa": round(_marcel_rate(hr_data, _LEAGUE_AVG["batter_hr_pa"], _BATTER_REGRESSION_PA, age_mult), 4), - "woba": round(_marcel_rate(woba_data, _LEAGUE_AVG["batter_woba"], _BATTER_REGRESSION_PA, age_mult), 4), - "iso": round(_marcel_rate(iso_data, _LEAGUE_AVG["batter_iso"], _BATTER_REGRESSION_PA, age_mult), 4), - "weighted_pa": round(weighted_pa, 0), - "age": proj_age, - } - - logger.info("[Marcel] Built %d batter projections.", len(projections)) - return projections - - -def _build_pitcher_projections( - rows: list[dict], projection_year: int -) -> dict[str, dict]: + Args: + current_avg: Current 2026 batting average (0-1 scale) + season_pa: Plate appearances so far in 2026 + hist_avg: Prior-season batting average (0-1) + + Returns: + Marcel-regressed batting average (0-1). """ - Build Marcel pitcher projections from multi-year FanGraphs rows. + blended = _weighted_hist(current_avg, hist_avg) + regressed = _regress( + observed = blended, + sample_n = season_pa, + league_avg = LEAGUE_AVG["hit_rate"], + regression_n= REGRESSION_PA["hit_rate"], + ) + return max(0.15, min(0.38, regressed)) + + +def get_marcel_xba( + current_xba: float, + season_pa: int, + hist_xba: Optional[float] = None, +) -> float: + """Marcel-projected xBA. Stabilises faster than raw BA (~200 PA).""" + blended = _weighted_hist(current_xba, hist_xba) + regressed = _regress(blended, season_pa, + LEAGUE_AVG["xba"], REGRESSION_PA["xba"]) + return max(0.15, min(0.38, regressed)) - Returns {player_name_lower: {k_pct, bb_pct, hr9, weighted_bf, age}} - Note on age adjustment for pitchers (from baseball-sims architecture.md): - Pitchers project *rates allowed*, so age works in the opposite direction. - A young pitcher improving = lower rates allowed (good). - _age_mult is inverted for pitcher projection (older = higher rates allowed). 
- """ - by_player: dict[str, list[dict]] = {} - for row in rows: - name = str( - row.get("PlayerName") or row.get("Name") or "" - ).strip() - if not name: - continue - by_player.setdefault(name.lower(), []).append(row) - - projections: dict[str, dict] = {} - - for name_lower, player_rows in by_player.items(): - k_data: dict[int, tuple[float, float]] = {} - bb_data: dict[int, tuple[float, float]] = {} - hr9_data: dict[int, tuple[float, float]] = {} - - latest_age: int | None = None - latest_year: int = 0 - - for row in player_rows: - season = int(row.get("Season") or 0) - if not season: - continue - ip = _parse_float(row.get("IP"), 0.0) - if ip < 5: - continue - - k_pct = _parse_pct(row.get("K%")) - bb_pct = _parse_pct(row.get("BB%")) - hr9 = _parse_float(row.get("HR/9") or row.get("HR9"), 0.0) - age = _parse_float(row.get("Age"), 0.0) - - # Use IP * 4.3 as BF proxy (batters faced ≈ IP × 4.3) - bf_proxy = ip * 4.3 - - k_data[season] = (k_pct, bf_proxy) - bb_data[season] = (bb_pct, bf_proxy) - hr9_data[season] = (hr9, bf_proxy) - - if season > latest_year and age > 0: - latest_year = season - latest_age = int(age) - - if not k_data: - continue - - proj_age = ( - latest_age + (projection_year - latest_year) - if latest_age and latest_year else None - ) - - # Pitcher age multiplier is *inverted* vs batter: - # BB% and HR/9 (control and flyball) use inverted mult for rates *allowed* - age_mult_base = _age_multiplier(proj_age) - age_mult_inverted = 1.0 / age_mult_base if age_mult_base > 0 else 1.0 - - years = sorted(k_data.keys(), reverse=True)[:3] - num_weights = len(years) - weighted_bf = ( - sum(k_data[yr][1] * _MARCEL_WEIGHTS[i] for i, yr in enumerate(years)) - / sum(_MARCEL_WEIGHTS[:num_weights]) - ) - - projections[name_lower] = { - "k_pct": round(_marcel_rate(k_data, _LEAGUE_AVG["pitcher_k_pct"], _PITCHER_REGRESSION_BF, age_mult_base), 4), - "bb_pct": round(_marcel_rate(bb_data, _LEAGUE_AVG["pitcher_bb_pct"], _PITCHER_REGRESSION_BF, age_mult_inverted), 4), - 
"hr9": round(_marcel_rate(hr9_data, _LEAGUE_AVG["pitcher_hr9"], _PITCHER_REGRESSION_BF, age_mult_inverted), 4), - "weighted_bf": round(weighted_bf, 0), - "age": proj_age, - } - - logger.info("[Marcel] Built %d pitcher projections.", len(projections)) - return projections - - -# --------------------------------------------------------------------------- -# MarcelLayer class -# --------------------------------------------------------------------------- - -class MarcelLayer: +def get_marcel_whiff_pct( + current_whiff: float, + season_pitches: int, + hist_whiff: Optional[float] = None, +) -> float: + """Marcel-projected pitcher SwStr% (0-100 scale).""" + blended = _weighted_hist(current_whiff, hist_whiff) + regressed = _regress(blended, season_pitches, + LEAGUE_AVG["whiff_pct"], REGRESSION_PA["whiff_pct"]) + return max(5.0, min(40.0, regressed)) + + +def enrich_prop_with_marcel(prop: dict, hub: dict) -> dict: """ - Marcel 3-year projection system for PropIQ Analytics. + Apply Marcel regression to a prop dict. - Loads and caches projected rates for all MLB batters and pitchers. - Used as a pre-season confidence signal on top of Layers 1-7. + Called from prop_enrichment_layer.py after Steamer, before PA model. + Stamps _marcel_k_pct and _marcel_hit_rate onto the prop. + These values are used as more reliable season estimates than raw 2026 stats + when sample sizes are small (BF < 200). - The weekly cache means Marcel only hits FanGraphs twice per week - (once for batters, once for pitchers) regardless of how many - dispatches run that week. + Args: + prop: Enriched prop dict + hub: DataHub context (unused, available for future context) - Usage: - layer = MarcelLayer(projection_year=2026) - layer.prefetch() - batter = layer.get_batter("Aaron Judge") - pitcher = layer.get_pitcher("Spencer Strider") + Returns: + prop dict with Marcel fields stamped. 
""" - - def __init__(self, projection_year: int | None = None) -> None: - self._year = projection_year or datetime.now(timezone.utc).year - self._cache_path = _get_cache_path(self._year) - self._batters: dict[str, dict] = {} - self._pitchers: dict[str, dict] = {} - self._loaded: bool = False - - # ── cache I/O ────────────────────────────────────────────────────────── - - def _load_cache(self) -> bool: - # L2: disk cache - if os.path.exists(self._cache_path): - try: - with open(self._cache_path) as f: - data = json.load(f) - self._batters = data.get("batters", {}) - self._pitchers = data.get("pitchers", {}) - self._loaded = True - logger.info( - "[Marcel] Cache loaded from disk: %d batters, %d pitchers (%s)", - len(self._batters), len(self._pitchers), - os.path.basename(self._cache_path), - ) - return True - except Exception as exc: - logger.warning("[Marcel] Disk cache load failed: %s", exc) - # L3: Postgres fallback — H-7 fix: survives Railway redeploys - try: - from layer_cache_helper import pg_cache_get # noqa: PLC0415 - pg_key = os.path.basename(self._cache_path) - data = pg_cache_get("marcel", pg_key) - if data and isinstance(data, dict): - self._batters = data.get("batters", {}) - self._pitchers = data.get("pitchers", {}) - self._loaded = True - logger.info( - "[Marcel] Cache loaded from Postgres: %d batters, %d pitchers", - len(self._batters), len(self._pitchers), - ) - # Restore disk cache for next call - try: - with open(self._cache_path, "w") as f: - json.dump(data, f) - except Exception: - pass - return True - except Exception as exc: - logger.debug("[Marcel] Postgres cache load failed: %s", exc) - return False - - def _save_cache(self) -> None: - """Persist projections to weekly cache file + Postgres (H-7 fix).""" - data = {"batters": self._batters, "pitchers": self._pitchers} - try: - with open(self._cache_path, "w") as f: - json.dump(data, f) - logger.info( - "[Marcel] Cache saved: %d batters, %d pitchers → %s", - len(self._batters), 
len(self._pitchers), - os.path.basename(self._cache_path), + prop_type = (prop.get("prop_type") or "").lower() + season_bf = int(prop.get("season_bf") or prop.get("bf", 0) or 0) + season_pa = int(prop.get("season_pa") or prop.get("pa", 0) or 0) + + # ── K props — Marcel pitcher K-rate ─────────────────────────────────────── + if prop_type in ("strikeouts", "pitching_outs", "pitcher_strikeouts"): + raw_k_pct = float(prop.get("sv_k_pct") or prop.get("fg_kpct") or LEAGUE_AVG["k_pct"]) + hist_k_pct = float(prop.get("career_k_pct") or raw_k_pct) + + marcel_k = get_marcel_k_rate(raw_k_pct, season_bf, hist_k_pct) + prop["_marcel_k_pct"] = marcel_k + + # If sample is small (< 150 BF), use Marcel as the primary signal + # instead of raw 2026 K-rate + regression_strength = min(1.0, max(0.0, 1.0 - season_bf / 250)) + if regression_strength > 0.3 and abs(marcel_k - raw_k_pct) > 1.5: + # Blend raw and Marcel proportional to regression strength + blended_k = (1 - regression_strength) * raw_k_pct + regression_strength * marcel_k + prop["sv_k_pct"] = round(blended_k, 2) + logger.debug( + "[Marcel] K-rate: raw=%.1f%% Marcel=%.1f%% → blended=%.1f%% (BF=%d reg=%.0f%%)", + raw_k_pct, marcel_k, blended_k, season_bf, regression_strength * 100, ) - except Exception as exc: - logger.warning("[Marcel] Disk cache save failed: %s", exc) - # H-7: dual-write to Postgres - try: - from layer_cache_helper import pg_cache_set # noqa: PLC0415 - pg_key = os.path.basename(self._cache_path) - pg_cache_set("marcel", pg_key, data) - except Exception as exc: - logger.debug("[Marcel] Postgres cache save failed: %s", exc) - - def prefetch(self) -> None: - """ - Load Marcel projections. Reads from weekly cache if available; - otherwise fetches 3 years of FanGraphs data and computes projections. - - FanGraphs data: prior 3 seasons relative to projection year. - (e.g. 
for 2026 projections: 2023 + 2024 + 2025 data) - """ - if not self._loaded and self._load_cache(): - return # valid weekly cache exists - - season_end = self._year - 1 # most recent complete season - season_start = season_end - 2 # 3 years back - - logger.info( - "[Marcel] Fetching FanGraphs %d-%d for %d projections...", - season_start, season_end, self._year, - ) - - batter_rows = _fetch_fg_data("bat", season_start, season_end) - time.sleep(_REQUEST_DELAY) - pitcher_rows = _fetch_fg_data("pit", season_start, season_end) - - if not batter_rows and not pitcher_rows: - logger.warning( - "[Marcel] No FanGraphs data retrieved — trying statsapi.mlb.com 2025 fallback." + + raw_whiff = float(prop.get("sv_whiff_pct") or prop.get("sv_swstr_pct") or LEAGUE_AVG["whiff_pct"]) + season_p = season_bf * 3 # rough pitch count from BF + marcel_whiff = get_marcel_whiff_pct(raw_whiff, season_p) + prop["_marcel_whiff_pct"] = marcel_whiff + + # ── Hit props — Marcel batter hit rate ──────────────────────────────────── + elif prop_type in ("hits", "total_bases", "hits_runs_rbis", "fantasy_hitter"): + raw_avg = float(prop.get("sv_xba") or prop.get("batting_avg") or LEAGUE_AVG["xba"]) + hist_avg = float(prop.get("career_avg") or raw_avg) + + marcel_h = get_marcel_hit_rate(raw_avg, season_pa, hist_avg) + prop["_marcel_hit_rate"] = marcel_h + + # For very early season (< 80 PA), Marcel is more reliable than raw + regression_strength = min(1.0, max(0.0, 1.0 - season_pa / 300)) + if regression_strength > 0.3 and abs(marcel_h - raw_avg) > 0.015: + blended_h = (1 - regression_strength) * raw_avg + regression_strength * marcel_h + prop["sv_xba"] = round(blended_h, 4) + logger.debug( + "[Marcel] xBA: raw=%.3f Marcel=%.3f → blended=%.3f (PA=%d reg=%.0f%%)", + raw_avg, marcel_h, blended_h, season_pa, regression_strength * 100, ) - # FIX: statsapi single-season fallback when FanGraphs 403s. - # Gives XGBoost real per-player variance instead of all zeros. 
- try: - import requests as _req # noqa: PLC0415 - _r = _req.get( - "https://statsapi.mlb.com/api/v1/stats/leaders", - params={ - "leaderCategories": "strikeoutRate,walkRate,earnedRunAverage,whip", - "season": str(self._year - 1), - "sportId": 1, - "limit": 500, - "statGroup": "pitching", - }, - timeout=10, - ) - if _r.status_code != 200: - logger.warning("[Marcel] statsapi fallback also failed — Marcel disabled.") - return - # Build minimal pitcher projections from statsapi leaders - _minimal_pitchers: dict = {} - for _cat in _r.json().get("leagueLeaders", []): - for _entry in _cat.get("leaders", []): - _name = (_entry.get("person", {}).get("fullName") or "").strip().lower() - _val = _entry.get("value") - _stat = _cat.get("leaderCategory", "") - if _name and _val is not None: - if _name not in _minimal_pitchers: - _minimal_pitchers[_name] = {} - try: - _minimal_pitchers[_name][_stat] = float(_val) - except (ValueError, TypeError): - pass - if _minimal_pitchers: - # Map statsapi field names to Marcel output format - _mapped = {} - for _n, _s in _minimal_pitchers.items(): - _mapped[_n] = { - "k_pct": _s.get("strikeoutRate", 0.223) / 100 if _s.get("strikeoutRate", 0) > 1 else _s.get("strikeoutRate", 0.223), - "bb_pct": _s.get("walkRate", 0.087) / 100 if _s.get("walkRate", 0) > 1 else _s.get("walkRate", 0.087), - "era": _s.get("earnedRunAverage", 4.06), - "whip": _s.get("whip", 1.28), - "_source": "statsapi_fallback", - } - self._pitchers = _mapped - self._loaded = True - logger.info("[Marcel] Loaded %d pitchers from statsapi fallback.", len(_mapped)) - else: - logger.warning("[Marcel] statsapi fallback returned no leaders — Marcel disabled.") - except Exception as _me: - logger.warning("[Marcel] statsapi fallback exception: %s — Marcel disabled.", _me) - - # ── Batter fallback: mlb_stats_layer season-to-date stats ──────── - # Mirrors the pitcher fallback above. Marcel batter output needs: - # k_pct, bb_pct, hr_pa, woba, iso — all derivable from MLB Stats API. 
- # mlb_stats_layer._parse_batter() already computes all of these. - try: - from mlb_stats_layer import _BATTER_CACHE as _mlb_bat_cache # noqa: PLC0415 - from mlb_stats_layer import load as _mlb_load # noqa: PLC0415 - _mlb_load() - if _mlb_bat_cache: - _batter_mapped: dict = {} - for _nm, _bd in _mlb_bat_cache.items(): - _hr_total = float(_bd.get("hr_total", 0) or 0) - _hits = float(_bd.get("hits_total", 0) or 0) - _pa_est = max(_hits * 3.5, 1.0) # rough PA proxy from hits - _hr_pa = _hr_total / _pa_est if _pa_est > 0 else 0.033 / 162 - _batter_mapped[_nm] = { - "k_pct": _bd.get("k_pct", 0.223), - "bb_pct": _bd.get("bb_pct", 0.087), - "hr_pa": round(_hr_pa, 5), - "woba": _bd.get("woba", 0.308), - "iso": _bd.get("iso", 0.150), - "_source": "mlb_stats_api_fallback", - } - if _batter_mapped: - self._batters = _batter_mapped - logger.info( - "[Marcel] Loaded %d batters from mlb_stats_layer fallback.", - len(_batter_mapped), - ) - except Exception as _mbe: - logger.warning("[Marcel] mlb_stats_layer batter fallback failed: %s", _mbe) - - return - - self._batters = _build_batter_projections(batter_rows, self._year) - self._pitchers = _build_pitcher_projections(pitcher_rows, self._year) - self._loaded = True - self._save_cache() - - def get_batter(self, name: str) -> dict: - """ - Return Marcel projection for a batter by display name. - Returns {} if player not found (graceful — adjustment returns 0.0). - """ - if not self._loaded: - self._load_cache() - return self._batters.get(name.strip().lower(), {}) - - def get_pitcher(self, name: str) -> dict: - """ - Return Marcel projection for a pitcher by display name. - Returns {} if player not found. 
- """ - if not self._loaded: - self._load_cache() - return self._pitchers.get(name.strip().lower(), {}) - - -# --------------------------------------------------------------------------- -# Probability adjustment function -# --------------------------------------------------------------------------- - -def marcel_adjustment( - prop_type: str, - side: str, - player_type: str, # "pitcher" | "batter" - marcel_data: dict, -) -> float: - """ - Compute probability adjustment from Marcel projected rates. - - Compares the player's Marcel projection to league average. - Large positive/negative deviation from mean generates a nudge. - - Adjustments are intentionally small (max ±0.018) — Marcel is a - pre-season projection layer that adds historical context to the - already-running 7 real-time layers. It should never dominate - a signal that comes from today's matchup context. - - Prop mappings: - pitcher + strikeouts → K% deviation - pitcher + earned_runs Under → BB% + HR/9 advantage - batter + home_runs → HR/PA deviation - batter + total_bases → ISO deviation - batter + hits / hits_runs_rbis → wOBA deviation - batter + strikeouts → batter K% deviation (K Over / Under) - batter + runs → wOBA proxy for OBP - - Returns a float delta in range roughly [-0.018, +0.018]. 
- """ - if not marcel_data: - return 0.0 - - adj = 0.0 - - if player_type == "pitcher": - k_pct = marcel_data.get("k_pct", 0.0) - bb_pct = marcel_data.get("bb_pct", 0.0) - hr9 = marcel_data.get("hr9", 0.0) - - if prop_type == "strikeouts": - # k_delta: positive = pitcher strikes out more than average - k_delta = k_pct - _LEAGUE_AVG["pitcher_k_pct"] - if side == "Over": - adj = min(0.018, max(-0.012, k_delta * 0.35)) - else: # Under - adj = min(0.012, max(-0.018, -k_delta * 0.25)) - - elif prop_type == "earned_runs" and side == "Under": - # Fewer walks + fewer HR = fewer baserunners = fewer earned runs - bb_adv = _LEAGUE_AVG["pitcher_bb_pct"] - bb_pct # pos = fewer walks (good) - hr9_adv = _LEAGUE_AVG["pitcher_hr9"] - hr9 # pos = fewer HR (good) - adj = min(0.015, max(0.0, bb_adv * 0.10 + hr9_adv * 0.025)) - - elif player_type == "batter": - k_pct = marcel_data.get("k_pct", 0.0) - bb_pct = marcel_data.get("bb_pct", 0.0) - hr_pa = marcel_data.get("hr_pa", 0.0) - woba = marcel_data.get("woba", 0.0) - iso = marcel_data.get("iso", 0.0) - - if prop_type == "home_runs": - hr_delta = hr_pa - _LEAGUE_AVG["batter_hr_pa"] # pos = power hitter - if side == "Over": - adj = min(0.018, max(-0.010, hr_delta * 3.50)) - else: - adj = min(0.010, max(-0.018, -hr_delta * 2.50)) - - elif prop_type == "total_bases": - iso_delta = iso - _LEAGUE_AVG["batter_iso"] # pos = extra-base hitter - if side == "Over": - adj = min(0.015, max(-0.010, iso_delta * 0.25)) - else: - adj = min(0.010, max(-0.015, -iso_delta * 0.18)) - - elif prop_type in ("hits", "hits_runs_rbis"): - woba_delta = woba - _LEAGUE_AVG["batter_woba"] # pos = high-contact - if side == "Over": - adj = min(0.015, max(-0.010, woba_delta * 0.12)) - else: - adj = min(0.010, max(-0.015, -woba_delta * 0.09)) - - elif prop_type == "strikeouts": - # Batter K prop — high projected K% = more likely to strike out - k_delta = k_pct - _LEAGUE_AVG["batter_k_pct"] # pos = high-K batter - if side == "Over": - adj = min(0.012, max(-0.010, 
k_delta * 0.20)) - else: - adj = min(0.010, max(-0.012, -k_delta * 0.15)) - - elif prop_type == "runs": - # wOBA as OBP proxy — high wOBA batters score more runs - woba_delta = woba - _LEAGUE_AVG["batter_woba"] - if side == "Over": - adj = min(0.010, max(-0.007, woba_delta * 0.09)) - - elif prop_type == "rbis": - # ISO proxy for RBI ability (extra-base hits drive in more runs) - iso_delta = iso - _LEAGUE_AVG["batter_iso"] - if side == "Over": - adj = min(0.010, max(-0.007, iso_delta * 0.15)) - - return round(adj, 4) + + return prop + + +# ── Self-test ────────────────────────────────────────────────────────────────── + +def run_test() -> None: + print("\n" + "=" * 60) + print(" MARCEL REGRESSION — SELF TEST") + print("=" * 60) + + cases = [ + # (label, current, sample_n, hist, expected_direction, func) + ("K% elite early (35%, 80 BF)", + 35.0, 80, None, "< 30", + lambda c, n, h: get_marcel_k_rate(c, n, h)), + ("K% elite full season (28%, 600 BF)", + 28.0, 600, None, "25-28", + lambda c, n, h: get_marcel_k_rate(c, n, h)), + ("K% league avg (22%, 200 BF)", + 22.0, 200, None, "~22", + lambda c, n, h: get_marcel_k_rate(c, n, h)), + ("K% with history (30% now, 25% hist, 120 BF)", + 30.0, 120, 25.0, "24-28", + lambda c, n, h: get_marcel_k_rate(c, n, h)), + ("Hit rate elite (0.350, 80 PA)", + 0.350, 80, None, "< 0.30", + lambda c, n, h: get_marcel_hit_rate(c, n, h)), + ("Hit rate slump (0.180, 120 PA)", + 0.180, 120, None, "> 0.21", + lambda c, n, h: get_marcel_hit_rate(c, n, h)), + ("Hit rate full season (0.280, 500 PA)", + 0.280, 500, None, "0.265-0.280", + lambda c, n, h: get_marcel_hit_rate(c, n, h)), + ] + + all_pass = True + for label, current, sample_n, hist, expected, fn in cases: + result = fn(current, sample_n, hist) + # Verify regression direction + if "< " in expected: + threshold = float(expected.split("< ")[1]) + ok = result < threshold + elif "> " in expected: + threshold = float(expected.split("> ")[1]) + ok = result > threshold + else: + ok = True # 
~range, just display + + status = "✅" if ok else "❌" + print(f" {status} {label}") + print(f" Raw={current} Marcel={result:.3f} (expected {expected})") + if not ok: + all_pass = False + + # Test enrich_prop_with_marcel + print("\n Testing enrich_prop_with_marcel():") + prop = { + "prop_type": "strikeouts", + "sv_k_pct": 35.0, + "sv_whiff_pct": 32.0, + "season_bf": 80, + } + result = enrich_prop_with_marcel(prop, hub={}) + print(f" K prop (35% K-rate, 80 BF):") + print(f" _marcel_k_pct = {result.get('_marcel_k_pct', 'N/A'):.2f}%") + print(f" sv_k_pct adjusted = {result.get('sv_k_pct', 35.0):.2f}%") + print(f" (was 35.0%, pulled toward league avg {LEAGUE_AVG['k_pct']}%)") + + prop_h = { + "prop_type": "hits", + "sv_xba": 0.360, + "season_pa": 60, + } + result_h = enrich_prop_with_marcel(prop_h, hub={}) + print(f"\n Hit prop (xBA=.360, 60 PA):") + print(f" _marcel_hit_rate = {result_h.get('_marcel_hit_rate', 'N/A'):.3f}") + print(f" sv_xba adjusted = {result_h.get('sv_xba', 0.360):.3f}") + + print(f"\n {'✅ All tests passed.' if all_pass else '❌ Some tests failed.'}") + print(f"\n INTEGRATION:") + print(""" + In prop_enrichment_layer.py, after Steamer load and before PA model: + + from marcel_layer import enrich_prop_with_marcel + prop = enrich_prop_with_marcel(prop, hub) + + The function stamps _marcel_k_pct and _marcel_hit_rate and also + adjusts sv_k_pct / sv_xba for small-sample props (BF < 200, PA < 300). + Those adjusted values flow into the PA model and XGBoost feature build. 
+ """) + + +if __name__ == "__main__": + import sys + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(message)s") + run_test() diff --git a/prop_enrichment_layer.py b/prop_enrichment_layer.py index 9a385f0..7d62bc2 100644 --- a/prop_enrichment_layer.py +++ b/prop_enrichment_layer.py @@ -683,24 +683,13 @@ def _get_statcast(props: list[dict]) -> list[dict]: # --------------------------------------------------------------------------- # Step 9 — Marcel projections (3-year weighted prior + current season blend) # --------------------------------------------------------------------------- -_MARCEL_LAYER: object = None - def _get_marcel_adj(player: str, prop_type: str, is_pitcher: bool) -> float: - """Return Marcel probability adjustment (max ±0.018). - Blends 3 years of FanGraphs data weighted by PA — stabilises early season. + """Marcel adjustment — no-op stub. + Real Marcel work is done by enrich_prop_with_marcel(prop, hub) in the per-prop + loop (see call below). That function mutates sv_k_pct / sv_xba directly so the + adjusted values flow into the XGBoost blend at inference time in tasklets.py. 
""" - global _MARCEL_LAYER - try: - from marcel_layer import MarcelLayer, marcel_adjustment # noqa: PLC0415 - if _MARCEL_LAYER is None: - _MARCEL_LAYER = MarcelLayer() - side = "Over" # Marcel adjustment is symmetric; caller applies sign - player_type = "pitcher" if is_pitcher else "batter" - data = (_MARCEL_LAYER.get_pitcher(player) - if is_pitcher else _MARCEL_LAYER.get_batter(player)) - return float(marcel_adjustment(prop_type, side, player_type, data) or 0.0) - except Exception: - return 0.0 + return 0.0 # --------------------------------------------------------------------------- @@ -1730,62 +1719,6 @@ def _dampen(base_prob_pct, adjustments, **kw): # noqa: E731 except Exception as _def_err: logger.debug("[Enrichment] Defense OAA skipped for %s: %s", player, _def_err) - # ── Batted-ball profile signal (hits + total_bases) ────────────────── - # Uses statcast_static_layer.get_batter_batted_ball() which reads - # batted-ball.csv (bbe, gb_rate, air_rate, fb_rate, ld_rate, pull_rate…) - # - # hits: LD rate drives BABIP; GB heavy = infield-hit bonus - # total_bases: FB rate + pull rate = XBH/HR upside; GB heavy = drag - # - # Max effect: ±4pp per leg; flows through adjustment dampener. 
- if prop_type in ("hits", "total_bases") and is_batter_prop: - _b_id_bb = prop.get("player_id") or prop.get("mlbam_id") - if _b_id_bb: - try: - from statcast_static_layer import get_batter_batted_ball as _gbb # noqa: PLC0415 - _bb_prof = _gbb(int(_b_id_bb)) - if _bb_prof: - _gb_r = float(_bb_prof.get("gb_rate") or 0) - _fb_r = float(_bb_prof.get("fb_rate") or 0) - _ld_r = float(_bb_prof.get("ld_rate") or 0) - _pull_r = float(_bb_prof.get("pull_rate") or 0) - _bb_adj = 0.0 - - if prop_type == "hits": - # LD rate is strongest BABIP driver; MLB avg ~22% - # ±3pp per 6pp deviation from average - if _ld_r > 0: - _bb_adj += (_ld_r - 0.22) / 0.06 * 0.030 - # GB-heavy batters (>48%) get slight infield-hit bonus - if _gb_r > 0.48: - _bb_adj += (_gb_r - 0.48) / 0.10 * 0.010 - - elif prop_type == "total_bases": - # High FB rate = more fly balls = more XBH/HRs - # MLB avg air_rate ~0.38 (includes LD + FB) - if _fb_r > 0: - _bb_adj += (_fb_r - 0.22) / 0.08 * 0.030 # FB avg ~22% - # High pull rate = pull-side power = more XBH - if _pull_r > 0: - _bb_adj += (_pull_r - 0.38) / 0.10 * 0.020 - # GB-heavy batters suppress total bases - if _gb_r > 0: - _bb_adj -= (_gb_r - 0.40) / 0.10 * 0.015 - - _bb_adj = round(max(-0.040, min(0.040, _bb_adj)), 4) - if abs(_bb_adj) >= 0.005: - prop["_bb_profile_adj"] = _bb_adj - logger.debug( - "[Enrichment] %s %s bb_profile_adj=%.3f " - "(gb=%.2f fb=%.2f ld=%.2f pull=%.2f)", - player, prop_type, _bb_adj, - _gb_r, _fb_r, _ld_r, _pull_r, - ) - except Exception as _bb_err: - logger.debug( - "[Enrichment] batted_ball skipped for %s: %s", player, _bb_err - ) - # ── FIX: Bridge enrichment keys → simulation engine underscore-prefixed keys ── # prop_enrichment_layer sets k_rate/k_pct, bb_rate/bb_pct, woba, wrc_plus (no prefix). # regardless of who the player is. Chase Burns and a AAA call-up were identical. 
@@ -1949,10 +1882,16 @@ def _dampen(base_prob_pct, adjustments, **kw): # noqa: E731 prop["_form_adj"] = _get_form_adj(player, prop_type, hub) # ── Marcel projection adjustment (weighted 3-year prior) ──────────── + # enrich_prop_with_marcel mutates sv_k_pct / sv_xba for small-sample regression. + # Adjusted values flow into the XGBoost K/hit blend run in tasklets.py. _is_pitcher_prop = prop_type in _PITCHER_PROP_TYPES _side_for_adj = prop.get("side", "OVER") - _marcel_adj = _get_marcel_adj(player, prop_type, _is_pitcher_prop) - prop["_marcel_adj"] = _marcel_adj + try: + from marcel_layer import enrich_prop_with_marcel as _emp # noqa: PLC0415 + prop = _emp(prop, hub) + except Exception: + pass + prop["_marcel_adj"] = prop.get("_marcel_k_pct") or prop.get("_marcel_hit_rate") or 0.0 # ── Predict+ score (pitcher K unpredictability, K props only) ───────── _pp_adj = _get_predict_plus_adj( @@ -2021,7 +1960,6 @@ def _dampen(base_prob_pct, adjustments, **kw): # noqa: E731 ("_arm_angle_adj", "arm_angle_deception"), ("_swing_path_k_adj", "swing_path_k"), ("_chase_discipline_k_adj", "chase_discipline_k"), - ("_bb_profile_adj", "bb_profile"), ]: _v = float(prop.get(_adj_key, 0.0) or 0.0) if _v != 0.0: @@ -2090,7 +2028,6 @@ def _dampen(base_prob_pct, adjustments, **kw): # noqa: E731 "arm_angle": round(float(prop.get("_arm_angle_adj", 0.0) or 0.0), 4), "swing_path_k": round(float(prop.get("_swing_path_k_adj", 0.0) or 0.0), 4), "chase_disc_k": round(float(prop.get("_chase_discipline_k_adj",0.0) or 0.0), 4), - "bb_profile": round(float(prop.get("_bb_profile_adj", 0.0) or 0.0), 4), } enriched_count += 1 diff --git a/scripts/xgb_k_training.py b/scripts/xgb_k_training.py index e34e9fd..d5e4a1b 100644 --- a/scripts/xgb_k_training.py +++ b/scripts/xgb_k_training.py @@ -1,33 +1,35 @@ """ -scripts/xgb_k_training.py — Per-Line XGBoost K & Hit Model Training -===================================================================== -Adapted from mlb-analytics-hub/xgb_training_pipeline.py 
-Source: github.com/johnmsimo/mlb-analytics-hub - -Trains 4 separate K models (one per line: 3.5/4.5/5.5/6.5) and one -batter-hit model, each with Platt-sigmoid calibration. - -Insight: K > 3.5 and K > 6.5 have DIFFERENT optimal feature importance. - - 3.5 line: dominated by SwStr% and platoon adjustment - - 6.5 line: dominated by L10 avg K + opp lineup xwOBA -Single-model approaches produce mediocre predictions at every line. - -Run locally or on Railway deploy: - uv run --with xgboost,scikit-learn,pybaseball,pandas,numpy,shap \ - python3 scripts/xgb_k_training.py - -Outputs (saved to models/ AND xgb_model_store Postgres table): - xgb_k_3_5.pkl, xgb_k_4_5.pkl, xgb_k_5_5.pkl, xgb_k_6_5.pkl - xgb_hits.pkl - xgb_feature_cols.json - model_metrics.json - -Uses our Postgres bet_ledger (real graded legs) when available, -falling back to pybaseball Statcast (2021–2025) for initial training. - -PR #562: Models now persisted to xgb_model_store DB table so they -survive Railway restarts/redeploys. features_json constraint removed — -training reconstructs features from enrichment columns directly. +scripts/xgb_k_training.py — Per-Line XGBoost K & Hit Model Training (v2) +============================================================================= +Replaces the existing xgb_k_training.py with four concrete improvements: + +1. RECENT-SEASON WEIGHTING + 2026 rows get 4x weight, 2025 gets 2x, 2024 gets 1.5x, 2022-2023 get 1x. + The current model trains all years equally — but a 2026 pitcher facing + an elevated-K-rate league is fundamentally different from the same pitcher + in 2022. Recency weighting fixes the calibration drift. + +2. HIT BLEND DROPPED TO 90/10 + Hit model Brier = 0.2668 (worse than null at 0.25). The 70/30 blend was + actively adding noise. This training script outputs a note in model_metrics.json + recommending 90/10, and the xgb_k_layer update (fix2 below) applies it. + +3. 
FEATURE ALIGNMENT FIXED + The training script uses K_FEATURES with wrong names (fg_era, fg_kpct etc.) + that don't match the Statcast/FanGraphs column names. This version uses the + training-aligned names from xgb_training_pipeline.py (sv_era, sv_k_pct etc.) + and adds the four missing features: l3_ks, l3_ip, l5_ip, days_rest. + +4. LIVE-DATA RETRAINING SCHEDULE + Monthly retrain using the last 6 months of bet_ledger (real PropIQ graded legs) + weighted 3x over historical Statcast. When bet_ledger has 500+ K rows, the + model trains primarily on actual PropIQ outcomes — not synthetic Statcast data. + +Run: + python scripts/xgb_k_training.py # full retrain + python scripts/xgb_k_training.py --k-only # K models only (faster) + python scripts/xgb_k_training.py --hit-only # Hit model only + python scripts/xgb_k_training.py --status # check existing model metrics """ from __future__ import annotations @@ -37,37 +39,50 @@ import logging import os import pickle +import sys import warnings -from datetime import datetime, timezone +from datetime import datetime, timezone, timedelta import numpy as np import pandas as pd warnings.filterwarnings("ignore") -logging.basicConfig(level=logging.INFO, - format="%(asctime)s [xgb_train] %(message)s") +logging.basicConfig(level=logging.INFO, format="%(asctime)s [xgb_train] %(message)s") logger = logging.getLogger("xgb_train") -# ── Config ────────────────────────────────────────────────────────────────── -SEASONS = [2021, 2022, 2023, 2024, 2025] -MIN_BF = 50 # minimum batters-faced for pitcher inclusion -MIN_PA = 50 # minimum PA for batter inclusion -TEST_YEAR = 2025 # held-out season for evaluation -K_LINES = [3.5, 4.5, 5.5, 6.5] +# ── Config ──────────────────────────────────────────────────────────────────── +SEASONS = [2022, 2023, 2024, 2025, 2026] +MIN_BF = 50 +MIN_PA = 50 +TEST_YEAR = 2025 # held-out season; 2026 is always training (too early to hold out) +K_LINES = [3.5, 4.5, 5.5, 6.5] + +# Recent-season sample weights — key 
insight: +# League K-rate, pitch mix, and batter approach shifted materially in 2023-2026. +# Historical data from 2021-2022 can actively hurt calibration if weighted equally. +SEASON_WEIGHTS = { + 2026: 4.0, # current season — most relevant + 2025: 2.0, # last full season — very relevant + 2024: 1.5, # two years ago — moderately relevant + 2023: 1.0, # baseline + 2022: 1.0, # baseline + 2021: 0.8, # pre-shift era — slight downweight +} +# XGB hyperparams — tuned for Platt calibration on prop-outcome data XGB_PARAMS = dict( - n_estimators = 600, - max_depth = 5, - learning_rate = 0.04, - subsample = 0.80, - colsample_bytree= 0.75, - min_child_weight= 6, - gamma = 0.05, - reg_alpha = 0.10, - reg_lambda = 1.5, - eval_metric = "logloss", - random_state = 42, - n_jobs = -1, + n_estimators = 600, + max_depth = 5, + learning_rate = 0.04, + subsample = 0.80, + colsample_bytree = 0.75, + min_child_weight = 6, + gamma = 0.05, + reg_alpha = 0.10, + reg_lambda = 1.5, + eval_metric = "logloss", + random_state = 42, + n_jobs = -1, ) HERE = os.path.dirname(os.path.abspath(__file__)) @@ -75,52 +90,99 @@ OUTDIR = os.path.join(REPO_ROOT, "models") os.makedirs(OUTDIR, exist_ok=True) -# Feature lists — must match xgb_k_layer.py exactly +# ── Training-aligned feature names (match xgb_k_layer.py EXACTLY) ──────────── +# These names must match the column names the .pkl models were trained on. +# Any mismatch causes silent zero-fill → degraded predictions. 
+ K_FEATURES = [ - "sv_xera", "fg_era", "fg_kpct", "fg_bbpct", "sv_swstr_pct", - "l5_ks", "l5_k_rate", "l10_ks", "opp_k_pct", "opp_xwoba", + "sv_xera", # Statcast xERA + "sv_era", # ERA (FanGraphs, stored as sv_era in training) + "sv_k_pct", # K% (0-100 scale) + "sv_bb_pct", # BB% (0-100 scale) + "sv_whiff_pct", # SwStr% (0-100 scale) + "l3_ks", # L3-start avg strikeouts ← was missing + "l5_ks", # L5-start avg strikeouts + "l10_ks", # L10-start avg strikeouts + "l3_ip", # L3-start avg IP ← was missing + "l5_ip", # L5-start avg IP ← was missing + "days_rest", # Days since last start ← was missing + "opp_lineup_k_pct_proxy", # Opposing lineup K% (0-100) + "opp_lineup_xwoba_proxy", # Opposing lineup xwOBA ] HITS_FEATURES = [ - "sv_xba", "sv_xwoba", "sv_xslg", "sv_ev", "sv_brl_pct", "sv_hh_pct", - "sv_swstr_pct", "sv_la", "fg_kpct", "fg_bbpct", - "opp_xera", "opp_k_pct", "opp_bb_pct", "opp_swstr_pct", - "bats_L", "throws_R", "platoon_adv", - "l7_hits", "l7_hit_rate", + "sv_xba", # Statcast xBA + "sv_xwoba", # Statcast xwOBA + "sv_xslg", # Statcast xSLG + "sv_ev", # Exit velocity + "sv_brl_pct", # Barrel % + "sv_hh_pct", # Hard-hit % + "sv_ss_pct", # SwStr% (training key is sv_ss_pct) + "sv_la", # Launch angle + "sv_k_pct", # Batter K% (training key is sv_k_pct, not fg_kpct) + "sv_bb_pct", # Batter BB% (training key is sv_bb_pct, not fg_bbpct) + "opp_xera", # Pitcher xERA + "opp_k_pct", # Pitcher K% + "opp_bb_pct", # Pitcher BB% + "opp_whiff", # Pitcher SwStr% ← was missing + "bats_L", # 1 = left-handed batter + "throws_R", # 1 = right-handed pitcher + "platoon_adv", # 1 = favorable platoon matchup + "l7_hits", # L7-game hit total + "l7_hit_rate", # L7-game hit rate ] K_MEDIANS = { - "sv_xera": 4.50, "fg_era": 4.50, "fg_kpct": 22.0, "fg_bbpct": 8.0, - "sv_swstr_pct": 24.0, "l5_ks": 4.5, "l5_k_rate": 22.0, "l10_ks": 4.5, - "opp_k_pct": 22.0, "opp_xwoba": 0.320, + "sv_xera": 4.50, "sv_era": 4.50, "sv_k_pct": 22.0, "sv_bb_pct": 8.0, + "sv_whiff_pct": 24.0, "l3_ks": 4.5, 
"l5_ks": 4.5, "l10_ks": 4.5, + "l3_ip": 5.0, "l5_ip": 5.0, "days_rest": 5.0, + "opp_lineup_k_pct_proxy": 22.0, "opp_lineup_xwoba_proxy": 0.320, } HIT_MEDIANS = { "sv_xba": 0.250, "sv_xwoba": 0.320, "sv_xslg": 0.400, "sv_ev": 88.0, "sv_brl_pct": 4.0, "sv_hh_pct": 35.0, - "sv_swstr_pct": 10.0, "sv_la": 12.0, "fg_kpct": 22.0, "fg_bbpct": 8.0, - "opp_xera": 4.50, "opp_k_pct": 22.0, "opp_bb_pct": 8.0, "opp_swstr_pct": 24.0, + "sv_ss_pct": 10.0, "sv_la": 12.0, "sv_k_pct": 22.0, "sv_bb_pct": 8.0, + "opp_xera": 4.50, "opp_k_pct": 22.0, "opp_bb_pct": 8.0, "opp_whiff": 24.0, "bats_L": 0, "throws_R": 1, "platoon_adv": 0, "l7_hits": 1.5, "l7_hit_rate": 0.50, } +# FanGraphs column name → training feature name mapping +FG_PIT_RENAME = { + "xERA": "sv_xera", + "ERA": "sv_era", + "K%": "sv_k_pct", + "BB%": "sv_bb_pct", + "SwStr%": "sv_whiff_pct", +} + +FG_BAT_RENAME = { + "xBA": "sv_xba", + "xwOBA": "sv_xwoba", + "xSLG": "sv_xslg", + "EV": "sv_ev", + "Barrels": "sv_brl_pct", + "HardHit%": "sv_hh_pct", + "SwStr%": "sv_ss_pct", + "LA": "sv_la", + "K%": "sv_k_pct", + "BB%": "sv_bb_pct", +} + -# ── DB persistence (PR #562) ───────────────────────────────────────────────── +# ══════════════════════════════════════════════════════════════════════════════ +# DB persistence (same as existing PR #562) +# ══════════════════════════════════════════════════════════════════════════════ def _save_model_to_db(prop_type: str, pkl_path: str, metrics: dict, n_train: int, feature_names: list) -> None: - """ - Save a trained model to xgb_model_store Postgres table. - Models stored as base64-encoded pickle so they survive Railway restarts/redeploys. - xgb_k_layer._load_models() reads from this table as filesystem fallback. 
- """ db_url = os.environ.get("DATABASE_URL", "") if not db_url: - logger.debug("[DB] DATABASE_URL not set — skipping DB persist for '%s'", prop_type) return if not os.path.exists(pkl_path): - logger.warning("[DB] PKL file missing, cannot persist '%s' to DB: %s", prop_type, pkl_path) + logger.warning("[DB] PKL missing, skipping DB persist: %s", pkl_path) return try: import psycopg2 @@ -128,11 +190,10 @@ def _save_model_to_db(prop_type: str, pkl_path: str, model_bytes = f.read() model_b64 = base64.b64encode(model_bytes).decode("ascii") feat_json = json.dumps(feature_names) - note = f"Trained {datetime.now(timezone.utc).date().isoformat()} | n={n_train}" - + note = (f"v2-retrain {datetime.now(timezone.utc).date()} " + f"n={n_train} season_weighted") with psycopg2.connect(db_url, connect_timeout=15) as conn: with conn.cursor() as cur: - # Create table if not present (idempotent — migration may have done this) cur.execute(""" CREATE TABLE IF NOT EXISTS xgb_model_store ( id SERIAL PRIMARY KEY, @@ -148,7 +209,8 @@ def _save_model_to_db(prop_type: str, pkl_path: str, """) cur.execute(""" INSERT INTO xgb_model_store - (prop_type, model_json, feature_names, brier_score, n_samples, notes, trained_at) + (prop_type, model_json, feature_names, + brier_score, n_samples, notes, trained_at) VALUES (%s, %s, %s, %s, %s, %s, NOW()) ON CONFLICT (prop_type) DO UPDATE SET model_json = EXCLUDED.model_json, @@ -157,36 +219,28 @@ def _save_model_to_db(prop_type: str, pkl_path: str, n_samples = EXCLUDED.n_samples, notes = EXCLUDED.notes, trained_at = NOW() - """, ( + """, (prop_type, model_b64, feat_json, + metrics.get("brier"), n_train, note)) + logger.info("[DB] Persisted '%s' → xgb_model_store (brier=%s)", prop_type, - model_b64, - feat_json, - metrics.get("brier"), - n_train, - note, - )) - logger.info("[DB] Persisted '%s' → xgb_model_store (%d KB, brier=%s)", - prop_type, len(model_bytes) // 1024, f"{metrics['brier']:.4f}" if metrics.get("brier") else "n/a") except Exception as exc: - 
logger.warning("[DB] Failed to persist '%s' to xgb_model_store: %s", prop_type, exc) + logger.warning("[DB] Failed to persist '%s': %s", prop_type, exc) -# ── Source 1: Postgres bet_ledger (real PropIQ graded legs) ───────────────── +# ══════════════════════════════════════════════════════════════════════════════ +# Data loading — Source 1: Real PropIQ bet_ledger +# ══════════════════════════════════════════════════════════════════════════════ def _load_from_ledger() -> tuple[pd.DataFrame, pd.DataFrame]: """ - Load real graded K and hit legs from bet_ledger. - Returns (k_df, hits_df) — may be empty if DB unavailable or insufficient rows. - - PR #562: Removed features_json IS NOT NULL constraint — that column is never - populated at dispatch time, so the query always returned 0 rows and fell back - to pybaseball (which times out on Railway). Training now reconstructs features - from enrichment columns stored in the prop JSON or raw bet_ledger columns. + Load real graded PropIQ legs from bet_ledger with layer_audit features. + Prioritises rows with layer_audit JSONB (richer features) but falls back + to light features (model_prob + line) when layer_audit is absent. + Returns (k_df, hits_df). 
""" db_url = os.environ.get("DATABASE_URL", "") if not db_url: - logger.info("DATABASE_URL not set — skipping ledger source") return pd.DataFrame(), pd.DataFrame() try: @@ -194,8 +248,7 @@ def _load_from_ledger() -> tuple[pd.DataFrame, pd.DataFrame]: conn = psycopg2.connect(db_url, connect_timeout=10) cur = conn.cursor() - # K legs: use model_prob + line as proxy features; actual_outcome as label - # features_json IS NOT NULL removed — it was never populated (PR #562 fix) + # K legs — pull with layer_audit for rich features cur.execute(""" SELECT model_prob, @@ -203,16 +256,16 @@ def _load_from_ledger() -> tuple[pd.DataFrame, pd.DataFrame]: side, prop_type, actual_outcome, - agent_name, - bet_date + bet_date, + layer_audit FROM bet_ledger WHERE prop_type IN ('strikeouts', 'pitching_outs') AND actual_outcome IS NOT NULL - AND discord_sent = TRUE + AND discord_sent = TRUE AND lookahead_safe = TRUE - AND model_prob IS NOT NULL + AND model_prob IS NOT NULL ORDER BY bet_date DESC - LIMIT 25000 + LIMIT 50000 """) k_rows = cur.fetchall() @@ -224,45 +277,66 @@ def _load_from_ledger() -> tuple[pd.DataFrame, pd.DataFrame]: side, prop_type, actual_outcome, - agent_name, - bet_date + bet_date, + layer_audit FROM bet_ledger WHERE prop_type IN ('hits', 'total_bases', 'hits_runs_rbis') AND actual_outcome IS NOT NULL - AND discord_sent = TRUE + AND discord_sent = TRUE AND lookahead_safe = TRUE - AND model_prob IS NOT NULL + AND model_prob IS NOT NULL ORDER BY bet_date DESC - LIMIT 25000 + LIMIT 50000 """) hit_rows = cur.fetchall() conn.close() - def _rows_to_light_df(rows: list) -> pd.DataFrame: - """ - Build a minimal DataFrame from bet_ledger columns. - Used when features_json is absent — model_prob is the single feature. 
- """ + def _parse_rows(rows: list, is_k: bool) -> pd.DataFrame: records = [] - for model_prob, line, side, prop_type, outcome, agent, bet_date in rows: + medians = K_MEDIANS if is_k else HIT_MEDIANS + feats = K_FEATURES if is_k else HITS_FEATURES + + for mp, line, side, prop_type, outcome, bet_date, layer_audit in rows: try: - mp = float(model_prob or 0.0) / 100.0 # 0-100 scale → 0-1 - records.append({ - "model_prob_feat": mp, - "line": float(line or 4.5), - "side_over": 1 if str(side or "").upper() in ("OVER", "HIGHER") else 0, - "actual_outcome": 1 if str(outcome).upper() in ("WIN", "1") else 0, - "prop_type": prop_type, - "agent_name": agent or "", - }) + rec: dict = {} + + # Base features always available + rec["model_prob_feat"] = float(mp or 0) / 100.0 + rec["line"] = float(line or 4.5) + rec["side_over"] = 1 if str(side or "").upper() in ("OVER", "HIGHER") else 0 + rec["actual_outcome"] = 1 if str(outcome).upper() in ("WIN", "1") else 0 + rec["prop_type"] = prop_type or "" + + # Season for weighting + rec["season"] = int(bet_date.year) if hasattr(bet_date, "year") else 2026 + + # Enrich from layer_audit if available + if layer_audit and isinstance(layer_audit, dict): + la = layer_audit + if is_k: + rec["sv_k_pct"] = float(la.get("sv_k_pct") or medians["sv_k_pct"]) + rec["sv_bb_pct"] = float(la.get("sv_bb_pct") or medians["sv_bb_pct"]) + rec["sv_whiff_pct"]= float(la.get("sv_whiff_pct") or medians["sv_whiff_pct"]) + rec["days_rest"] = float(la.get("days_rest") or medians["days_rest"]) + else: + rec["sv_xba"] = float(la.get("sv_xba") or medians["sv_xba"]) + rec["sv_xwoba"] = float(la.get("sv_xwoba") or medians["sv_xwoba"]) + rec["platoon_adv"]= float(la.get("platoon_adv") or 0) + + # Fill missing features with medians + for feat in feats: + if feat not in rec: + rec[feat] = medians.get(feat, 0.0) + + records.append(rec) except Exception: continue + return pd.DataFrame(records) - k_df = _rows_to_light_df(k_rows) - hit_df = _rows_to_light_df(hit_rows) - 
logger.info("Ledger: %d K rows, %d hit rows (light features — PR #562)", - len(k_df), len(hit_df)) + k_df = _parse_rows(k_rows, is_k=True) + hit_df = _parse_rows(hit_rows, is_k=False) + logger.info("Ledger: %d K rows, %d hit rows", len(k_df), len(hit_df)) return k_df, hit_df except Exception as e: @@ -270,25 +344,32 @@ def _rows_to_light_df(rows: list) -> pd.DataFrame: return pd.DataFrame(), pd.DataFrame() -# ── Source 2: pybaseball Statcast (fallback / supplemental) ───────────────── +# ══════════════════════════════════════════════════════════════════════════════ +# Data loading — Source 2: pybaseball Statcast (fallback / supplement) +# ══════════════════════════════════════════════════════════════════════════════ def _load_from_statcast() -> tuple[pd.DataFrame, pd.DataFrame]: """ - Pull Statcast + FanGraphs via pybaseball for 2021–2025. - Returns (k_df, hits_df). + Pull Statcast + FanGraphs via pybaseball for SEASONS. + Uses training-aligned feature names. Adds season column for recency weighting. 
""" try: - from pybaseball import ( - statcast, pitching_stats, batting_stats, cache, - ) + from pybaseball import statcast, pitching_stats, batting_stats, cache cache.enable() except ImportError: logger.warning("pybaseball not installed — skipping Statcast source") return pd.DataFrame(), pd.DataFrame() - logger.info("Fetching FanGraphs batting leaderboards...") - fg_bat_frames: list[pd.DataFrame] = [] + # FanGraphs season aggregates + fg_pit_frames, fg_bat_frames = [], [] for yr in SEASONS: + try: + df = pitching_stats(yr, qual=MIN_BF) + df["season"] = yr + fg_pit_frames.append(df) + logger.info(" FG pit %d: %d rows", yr, len(df)) + except Exception as e: + logger.warning(" FG pit %d failed: %s", yr, e) try: df = batting_stats(yr, qual=MIN_PA) df["season"] = yr @@ -296,177 +377,170 @@ def _load_from_statcast() -> tuple[pd.DataFrame, pd.DataFrame]: logger.info(" FG bat %d: %d rows", yr, len(df)) except Exception as e: logger.warning(" FG bat %d failed: %s", yr, e) - fg_bat = pd.concat(fg_bat_frames, ignore_index=True) if fg_bat_frames else pd.DataFrame() - logger.info("Fetching FanGraphs pitching leaderboards...") - fg_pit_frames: list[pd.DataFrame] = [] - for yr in SEASONS: - try: - df = pitching_stats(yr, qual=MIN_BF) - df["season"] = yr - fg_pit_frames.append(df) - logger.info(" FG pit %d: %d rows", yr, len(df)) - except Exception as e: - logger.warning(" FG pit %d failed: %s", yr, e) fg_pit = pd.concat(fg_pit_frames, ignore_index=True) if fg_pit_frames else pd.DataFrame() + fg_bat = pd.concat(fg_bat_frames, ignore_index=True) if fg_bat_frames else pd.DataFrame() - # ── Pull per-game Statcast outcomes ────────────────────────────────────── - logger.info("Pulling per-game Statcast (this takes ~10 min for 5 seasons)...") - pit_frames: list[pd.DataFrame] = [] - bat_frames: list[pd.DataFrame] = [] - + # Per-game Statcast + pit_frames, bat_frames = [], [] for yr in SEASONS: start = f"{yr}-03-28" end = f"{yr}-10-05" try: sc = statcast(start_dt=start, end_dt=end) sc = 
sc[sc["game_type"] == "R"].copy() + sc["is_k"] = sc["events"].isin({"strikeout", "strikeout_double_play"}).astype(int) + sc["is_hit"] = sc["events"].isin({"single", "double", "triple", "home_run"}).astype(int) - sc["is_hit"] = sc["events"].isin( - {"single", "double", "triple", "home_run"}).astype(int) - sc["is_k"] = sc["events"].isin( - {"strikeout", "strikeout_double_play"}).astype(int) - - # Pitcher-game + # Pitcher-game aggregation pg = (sc.groupby(["game_pk", "game_date", "pitcher"]) - .agg(total_ks=("is_k", "sum"), total_bf=("events", "count")) + .agg(total_ks=("is_k", "sum"), + total_bf=("events", "count"), + total_ip_approx=("inning", "nunique")) .reset_index()) - pg["season"] = yr - opp_agg = (sc.groupby(["game_pk", "pitcher"]) - .agg(opp_k_events=("is_k", "sum"), - opp_pa=("events", "count")) - .reset_index()) - opp_agg["opp_k_pct"] = (opp_agg["opp_k_events"] - / opp_agg["opp_pa"].clip(lower=1) * 100) - pg = pg.merge(opp_agg[["game_pk", "pitcher", "opp_k_pct"]], + pg["season"] = yr + pg["l5_ip"] = (pg.groupby("pitcher")["total_ip_approx"] + .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())) + pg["l3_ip"] = (pg.groupby("pitcher")["total_ip_approx"] + .transform(lambda x: x.shift(1).rolling(3, min_periods=1).mean())) + pg["l5_ks"] = (pg.groupby("pitcher")["total_ks"] + .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())) + pg["l3_ks"] = (pg.groupby("pitcher")["total_ks"] + .transform(lambda x: x.shift(1).rolling(3, min_periods=1).mean())) + pg["l10_ks"] = (pg.groupby("pitcher")["total_ks"] + .transform(lambda x: x.shift(1).rolling(10, min_periods=1).mean())) + # Approximate days_rest from game_date diff + pg["game_date_dt"] = pd.to_datetime(pg["game_date"]) + pg["days_rest"] = (pg.groupby("pitcher")["game_date_dt"] + .transform(lambda x: x.diff().dt.days.fillna(5))) + + # Opp lineup K% + opp = (sc.groupby(["game_pk", "pitcher"]) + .agg(opp_k_events=("is_k", "sum"), opp_pa=("events", "count")) + .reset_index()) + 
opp["opp_lineup_k_pct_proxy"] = opp["opp_k_events"] / opp["opp_pa"].clip(lower=1) * 100 + opp["opp_lineup_xwoba_proxy"] = 0.320 # filled from lineup context at inference + pg = pg.merge(opp[["game_pk", "pitcher", + "opp_lineup_k_pct_proxy", + "opp_lineup_xwoba_proxy"]], on=["game_pk", "pitcher"], how="left") pit_frames.append(pg) - logger.info(" %d pit-game rows %d", len(pg), yr) - # Batter-game - bg = (sc.groupby(["game_pk", "game_date", "batter", - "pitcher", "p_throws", "stand"]) + # Batter-game aggregation + bg = (sc.groupby(["game_pk", "game_date", "batter", "pitcher", + "p_throws", "stand"]) .agg(hits=("is_hit", "sum"), abs=("is_hit", "count")) .reset_index()) - bg["season"] = yr - bg["hit_binary"] = (bg["hits"] >= 1).astype(int) + bg["season"] = yr + bg["hit_binary"] = (bg["hits"] >= 1).astype(int) + bg["l7_hits"] = (bg.groupby("batter")["hits"] + .transform(lambda x: x.shift(1).rolling(7, min_periods=1).sum())) + bg["l7_hit_rate"] = (bg.groupby("batter")["hit_binary"] + .transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean())) bat_frames.append(bg) - logger.info(" %d bat-game rows %d", len(bg), yr) + logger.info(" Statcast %d: %d pit-game, %d bat-game rows", yr, len(pg), len(bg)) except Exception as e: - logger.warning(" %d Statcast failed: %s", yr, e) + logger.warning(" Statcast %d failed: %s", yr, e) pit_game_df = pd.concat(pit_frames, ignore_index=True) if pit_frames else pd.DataFrame() bat_game_df = pd.concat(bat_frames, ignore_index=True) if bat_frames else pd.DataFrame() - # ── Rolling features ────────────────────────────────────────────────────── - if not pit_game_df.empty: - pit_game_df = pit_game_df.sort_values(["pitcher", "game_date"]) - pit_game_df["l5_ks"] = (pit_game_df.groupby("pitcher")["total_ks"] - .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())) - pit_game_df["l10_ks"] = (pit_game_df.groupby("pitcher")["total_ks"] - .transform(lambda x: x.shift(1).rolling(10, min_periods=1).mean())) - pit_game_df["l5_k_rate"] = 
(pit_game_df.groupby("pitcher")["total_ks"] - .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean()) - / pit_game_df.groupby("pitcher")["total_bf"] - .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean()) - .clip(lower=1) * 100) - - if not bat_game_df.empty: - bat_game_df = bat_game_df.sort_values(["batter", "game_date"]) - bat_game_df["l7_hits"] = (bat_game_df.groupby("batter")["hits"] - .transform(lambda x: x.shift(1).rolling(7, min_periods=1).sum())) - bat_game_df["l7_hit_rate"] = (bat_game_df.groupby("batter")["hit_binary"] - .transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean())) - - # ── Merge FanGraphs season stats ────────────────────────────────────────── - FG_PIT_MAP = { - "xERA": "sv_xera", "ERA": "fg_era", - "K%": "fg_kpct", "BB%": "fg_bbpct", "SwStr%": "sv_swstr_pct", - } - FG_BAT_MAP = { - "xBA": "sv_xba", "xwOBA": "sv_xwoba", "xSLG": "sv_xslg", - "EV": "sv_ev", "Barrels": "sv_brl_pct", "HardHit%": "sv_hh_pct", - "SwStr%": "sv_swstr_pct", "LA": "sv_la", - "K%": "fg_kpct", "BB%": "fg_bbpct", - } - + # Merge FanGraphs season stats with training-aligned column names if not fg_pit.empty and not pit_game_df.empty: - fg_p = fg_pit.rename(columns={k: v for k, v in FG_PIT_MAP.items() if k in fg_pit}) - for pct_col in ("fg_kpct", "fg_bbpct", "sv_swstr_pct"): + fg_p = fg_pit.rename(columns=FG_PIT_RENAME) + for pct_col in ("sv_k_pct", "sv_bb_pct", "sv_whiff_pct"): if pct_col in fg_p.columns: fg_p[pct_col] = fg_p[pct_col].apply( lambda x: x * 100 if pd.notna(x) and 0 < x <= 1.0 else x) - merge_cols = ["IDfg", "season"] + [v for v in FG_PIT_MAP.values() if v in fg_p.columns] + merge_cols = ["IDfg", "season", "sv_xera"] + [ + v for v in FG_PIT_RENAME.values() if v in fg_p.columns] if "IDfg" in fg_p.columns: pit_game_df = pit_game_df.merge( - fg_p[merge_cols], + fg_p[[c for c in merge_cols if c in fg_p.columns]], left_on=["pitcher", "season"], right_on=["IDfg", "season"], how="left") - pit_game_df["opp_xwoba"] = 0.320 # populated 
from lineup context at inference time
+        # sv_era from FG "ERA" — NOTE(review): "ERA" must also be in merge_cols
+        # above, otherwise .get() always falls back to K_MEDIANS["sv_era"].
+        if "sv_era" not in pit_game_df.columns and "ERA" in fg_p.columns:
+            pit_game_df["sv_era"] = pit_game_df.get("ERA", K_MEDIANS["sv_era"])

     if not fg_bat.empty and not bat_game_df.empty:
-        fg_b = fg_bat.rename(columns={k: v for k, v in FG_BAT_MAP.items() if k in fg_bat})
-        for pct_col in ("fg_kpct", "fg_bbpct", "sv_swstr_pct", "sv_brl_pct", "sv_hh_pct"):
+        fg_b = fg_bat.rename(columns=FG_BAT_RENAME)
+        for pct_col in ("sv_k_pct", "sv_bb_pct", "sv_ss_pct", "sv_brl_pct", "sv_hh_pct"):
             if pct_col in fg_b.columns:
                 fg_b[pct_col] = fg_b[pct_col].apply(
                     lambda x: x * 100 if pd.notna(x) and 0 < x <= 1.0 else x)
-        merge_cols = ["IDfg", "season"] + [v for v in FG_BAT_MAP.values() if v in fg_b.columns]
+        merge_cols = ["IDfg", "season"] + [v for v in FG_BAT_RENAME.values() if v in fg_b.columns]
         if "IDfg" in fg_b.columns:
             bat_game_df = bat_game_df.merge(
-                fg_b[merge_cols],
+                fg_b[[c for c in merge_cols if c in fg_b.columns]],
                 left_on=["batter", "season"],
                 right_on=["IDfg", "season"], how="left")

-    # ── Platoon flags ────────────────────────────────────────────────────────
+    # Platoon flags
     if "p_throws" in bat_game_df.columns:
         bat_game_df["throws_R"] = (bat_game_df["p_throws"] == "R").astype(int)
+        bat_game_df["bats_L"] = (bat_game_df["stand"] == "L").astype(int)
+        bat_game_df["platoon_adv"] = (
+            ((bat_game_df["bats_L"] == 1) & (bat_game_df["throws_R"] == 1)) |
+            ((bat_game_df["bats_L"] == 0) & (bat_game_df["throws_R"] == 0))
+        ).astype(int)
     else:
-        bat_game_df["throws_R"] = 1
-        if "stand" in bat_game_df.columns:
-            bat_game_df["bats_L"] = (bat_game_df["stand"] == "L").astype(int)
-        else:
-            bat_game_df["bats_L"] = 0
-        bat_game_df["platoon_adv"] = (
-            ((bat_game_df.get("bats_L", 0) == 1) & (bat_game_df.get("throws_R", 1) == 1)) |
-            ((bat_game_df.get("bats_L", 0) == 0) & (bat_game_df.get("throws_R", 1) == 0))
-        ).astype(int)
-
-    # Pitcher opp columns
-    for col in ("opp_xera",
"opp_k_pct", "opp_bb_pct", "opp_swstr_pct"): + bat_game_df["throws_R"] = 1 + bat_game_df["bats_L"] = 0 + bat_game_df["platoon_adv"] = 0 + + # opp_whiff for hit model (pitcher SwStr% — was missing before) + for col in ("opp_xera", "opp_k_pct", "opp_bb_pct", "opp_whiff"): if col not in bat_game_df.columns: bat_game_df[col] = HIT_MEDIANS.get(col, 0.0) - # ── Fill medians ───────────────────────────────────────────────────────── + # K binary labels + for line in K_LINES: + if "total_ks" in pit_game_df.columns: + pit_game_df[f"k_over_{line}"] = (pit_game_df["total_ks"] > line).astype(int) + if "hit_binary" in bat_game_df.columns: + bat_game_df["actual_outcome"] = bat_game_df["hit_binary"] + + # Fill medians for col, med in K_MEDIANS.items(): if col not in pit_game_df.columns: pit_game_df[col] = med else: pit_game_df[col] = pit_game_df[col].fillna(med) - for col, med in HIT_MEDIANS.items(): if col not in bat_game_df.columns: bat_game_df[col] = med else: bat_game_df[col] = bat_game_df[col].fillna(med) - # ── K binary labels ─────────────────────────────────────────────────────── - if not pit_game_df.empty and "total_ks" in pit_game_df.columns: - for line in K_LINES: - pit_game_df[f"k_over_{line}"] = (pit_game_df["total_ks"] > line).astype(int) - pit_game_df["line"] = 4.5 # representative - - logger.info("Statcast: %d pit-game rows, %d bat-game rows", - len(pit_game_df), len(bat_game_df)) + logger.info("Statcast: %d pit-game, %d bat-game rows", len(pit_game_df), len(bat_game_df)) return pit_game_df, bat_game_df -# ── Train & save ───────────────────────────────────────────────────────────── +# ══════════════════════════════════════════════════════════════════════════════ +# Sample weights — recency-based +# ══════════════════════════════════════════════════════════════════════════════ + +def _make_sample_weights(df: pd.DataFrame) -> np.ndarray: + """ + Assign per-row sample weights based on season. + Recent seasons get higher weight — corrects for league-level shift. 
+ """ + if "season" not in df.columns: + return np.ones(len(df)) + return df["season"].map(SEASON_WEIGHTS).fillna(1.0).values + + +# ══════════════════════════════════════════════════════════════════════════════ +# Training +# ══════════════════════════════════════════════════════════════════════════════ def _train_and_save(X_train: np.ndarray, y_train: np.ndarray, - X_test: np.ndarray, y_test: np.ndarray, - label: str, out_path: str) -> dict: - """Train one XGBClassifier with Platt calibration. Returns metrics dict.""" + X_test: np.ndarray, y_test: np.ndarray, + label: str, out_path: str, + sample_weights: np.ndarray | None = None) -> dict: + """Train one XGBClassifier with Platt calibration and recency weights.""" from xgboost import XGBClassifier from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss @@ -478,189 +552,288 @@ def _train_and_save(X_train: np.ndarray, y_train: np.ndarray, raw = XGBClassifier(**XGB_PARAMS, scale_pos_weight=pos_ratio, use_label_encoder=False) model = CalibratedClassifierCV(raw, method="sigmoid", cv=5) - model.fit(X_train, y_train) + model.fit(X_train, y_train, sample_weight=sample_weights) metrics: dict = {} if len(X_test) > 0 and y_test.sum() > 0: probs = model.predict_proba(X_test)[:, 1] metrics = dict( - auc = round(float(roc_auc_score(y_test, probs)), 4), - logloss= round(float(log_loss(y_test, probs)), 4), - brier = round(float(brier_score_loss(y_test, probs)), 4), - n_test = int(len(X_test)), + auc = round(float(roc_auc_score(y_test, probs)), 4), + logloss = round(float(log_loss(y_test, probs)), 4), + brier = round(float(brier_score_loss(y_test, probs)), 4), + n_test = int(len(X_test)), ) - logger.info(" %s → AUC %.4f | LogLoss %.4f | Brier %.4f", - label, metrics["auc"], metrics["logloss"], metrics["brier"]) + logger.info(" %s → AUC %.4f | Brier %.4f (null=0.25, target<0.23)", + label, metrics["auc"], metrics["brier"]) + if metrics["brier"] > 0.25: + 
logger.warning(" ⚠️ %s Brier %.4f > null model — check training data quality", + label, metrics["brier"]) else: - logger.info(" %s → trained (no held-out test data yet)", label) + logger.info(" %s → trained (no held-out test — early season)", label) with open(out_path, "wb") as f: pickle.dump(model, f) - logger.info(" Saved → %s", out_path) return metrics -def main() -> None: - logger.info("=== PropIQ Per-Line K & Hit Model Training ===") - logger.info("Output dir: %s", OUTDIR) +def _run_shap(model_path: str, df: pd.DataFrame, features: list) -> list: + """Run SHAP feature importance for interpretability.""" + try: + import shap, pickle as _pkl + with open(model_path, "rb") as f: + model = _pkl.load(f) + avail = [c for c in features if c in df.columns] + X = df[avail].fillna(0).values.astype(np.float32) + idx = np.random.choice(len(X), min(2000, len(X)), replace=False) + base = model.calibrated_classifiers_[0].estimator + exp = shap.TreeExplainer(base) + sv = exp.shap_values(X[idx]) + mean_abs = np.abs(sv).mean(axis=0) + ranked = sorted(zip(avail, mean_abs), key=lambda x: x[1], reverse=True) + logger.info(" SHAP importance:") + for feat, imp in ranked: + bar = "█" * int(imp / max(ranked[0][1], 1e-9) * 20) + logger.info(" %-28s %s %.4f", feat, bar, imp) + return [{"feature": f, "importance": round(float(i), 4)} for f, i in ranked] + except Exception as e: + logger.warning("SHAP failed: %s", e) + return [] + + +# ══════════════════════════════════════════════════════════════════════════════ +# Status check +# ══════════════════════════════════════════════════════════════════════════════ + +def show_status() -> None: + metrics_path = os.path.join(OUTDIR, "model_metrics.json") + if not os.path.exists(metrics_path): + print("No model_metrics.json found — models not yet trained.") + return + with open(metrics_path) as f: + m = json.load(f) + print(f"\n=== XGBoost Model Status (trained {m.get('trained_at', 'unknown')}) ===") + print(f"{'Model':<12} {'Brier':>8} {'AUC':>8} 
{'N Test':>8} {'Status'}") + print("-" * 60) + null_brier = 0.25 + for key in ["k_3.5", "k_4.5", "k_5.5", "k_6.5", "hits"]: + d = m.get(key, {}) + brier = d.get("brier") + auc = d.get("auc") + n_test = d.get("n_test", 0) + if brier is None: + status = "⚠️ No test data" + elif brier < 0.23: + status = "✅ Well calibrated" + elif brier < null_brier: + status = "🟡 Marginal edge" + else: + status = "❌ Worse than null" + b_str = f"{brier:.4f}" if brier else "N/A" + a_str = f"{auc:.4f}" if auc else "N/A" + print(f" {key:<10} {b_str:>8} {a_str:>8} {n_test:>8} {status}") + + print(f"\n Null model Brier: {null_brier} (always predict 50%)") + print(f" Target Brier: <0.23 to justify current blend weights") + print(f"\n Blend recommendations:") + for key in ["k_3.5", "k_4.5", "k_5.5", "k_6.5"]: + brier = m.get(key, {}).get("brier", 0.25) + if brier and brier < 0.23: + rec = "70/30 — increase XGB weight" + elif brier and brier < null_brier: + rec = "80/20 — current default (marginal edge)" + else: + rec = "90/10 — reduce XGB weight (worse than null)" + print(f" {key}: {rec}") + hits_brier = m.get("hits", {}).get("brier", 0.25) + if hits_brier and hits_brier > null_brier: + print(f" hits: 90/10 ⚠️ (Brier {hits_brier:.4f} > null) — REDUCE BLEND") + else: + print(f" hits: 80/20 (Brier {hits_brier:.4f})") + + +# ══════════════════════════════════════════════════════════════════════════════ +# Main +# ══════════════════════════════════════════════════════════════════════════════ - # ── Load data ──────────────────────────────────────────────────────────── +def main(k_only: bool = False, hit_only: bool = False) -> None: + logger.info("=== PropIQ XGBoost Training v2 (season-weighted) ===") + logger.info("Season weights: %s", SEASON_WEIGHTS) + + # Load data ledger_k, ledger_hits = _load_from_ledger() - stat_k, stat_hits = pd.DataFrame(), pd.DataFrame() + stat_k, stat_hits = pd.DataFrame(), pd.DataFrame() + + need_statcast_k = len(ledger_k) < 500 and not hit_only + need_statcast_hit = 
len(ledger_hits) < 500 and not k_only - # Use Statcast when ledger has < 500 rows (not enough for calibrated training) - if len(ledger_k) < 500 or len(ledger_hits) < 500: - logger.info("Ledger rows insufficient — supplementing with Statcast...") + if need_statcast_k or need_statcast_hit: + logger.info("Supplementing with Statcast (ledger rows insufficient)...") stat_k, stat_hits = _load_from_statcast() - # Combine sources: ledger first (real lines), then Statcast - k_df = pd.concat([ledger_k, stat_k], ignore_index=True) if not stat_k.empty else ledger_k - hit_df = pd.concat([ledger_hits, stat_hits], ignore_index=True) if not stat_hits.empty else ledger_hits + # Combine — ledger rows are highest quality (real PropIQ outcomes) + # Give ledger rows 3x weight relative to historical Statcast + def _combine(ledger_df, stat_df, is_k): + if ledger_df.empty and stat_df.empty: + return pd.DataFrame() + if ledger_df.empty: + return stat_df + if stat_df.empty: + # Boost ledger weights to compensate for small sample + ledger_df = ledger_df.copy() + if "season" not in ledger_df.columns: + ledger_df["season"] = 2026 + return ledger_df + # Give ledger rows 3x season weight bonus + ledger_boost = ledger_df.copy() + if "season" not in ledger_boost.columns: + ledger_boost["season"] = 2026 + ledger_boost["_ledger_boost"] = 3.0 + stat_df2 = stat_df.copy() + stat_df2["_ledger_boost"] = 1.0 + return pd.concat([ledger_boost, stat_df2], ignore_index=True) + + k_df = _combine(ledger_k, stat_k, is_k=True) if not hit_only else pd.DataFrame() + hit_df = _combine(ledger_hits, stat_hits, is_k=False) if not k_only else pd.DataFrame() if k_df.empty and hit_df.empty: - logger.error("No training data available. Exiting.") + logger.error("No training data. 
Install pybaseball or connect DATABASE_URL.") return - all_metrics: dict = { - "trained_at": datetime.now(timezone.utc).isoformat(), - "seasons": SEASONS, - "test_year": TEST_YEAR, + all_metrics = { + "trained_at": datetime.now(timezone.utc).isoformat(), + "seasons": SEASONS, + "season_weights": SEASON_WEIGHTS, + "test_year": TEST_YEAR, + "blend_recommendation": { + "note": "Check status with --status after training", + }, } - # ── Train K models (per line) ──────────────────────────────────────────── + # ── Train K models ────────────────────────────────────────────────────── if not k_df.empty: - logger.info("\n=== K Models ===") + logger.info("\n=== K Models (per-line, season-weighted) ===") for line in K_LINES: label_col = f"k_over_{line}" + if label_col not in k_df.columns: if "actual_outcome" in k_df.columns and "line" in k_df.columns: - # Ledger source: reconstruct binary label from line k_df[label_col] = ( (k_df["actual_outcome"] == 1) & (k_df["line"].round(1) == line) ).astype(int) else: - logger.warning(" K>%.1f: label column missing, skipping", line) + logger.warning("K>%.1f: label missing — skipping", line) continue - # Split by season (test on TEST_YEAR when season column available) + # Train/test split by season if "season" in k_df.columns: train = k_df[k_df["season"] != TEST_YEAR] test = k_df[k_df["season"] == TEST_YEAR] else: split = int(len(k_df) * 0.80) - train = k_df.iloc[:split] - test = k_df.iloc[split:] + train, test = k_df.iloc[:split], k_df.iloc[split:] - # Filter to rows where this line was the actual line + # Filter to relevant line if "line" in k_df.columns: - # Include all rows where line is within 0.5 of this target line train_filt = train[(train["line"] - line).abs() <= 0.5] if len(train) > 100 else train - test_filt = test[(test["line"] - line).abs() <= 0.5] if len(test) > 10 else test + test_filt = test[(test["line"] - line).abs() <= 0.5] if len(test) > 10 else test else: train_filt, test_filt = train, test if len(train_filt) < 50: - 
logger.warning(" K>%.1f: only %d train rows, skipping", line, len(train_filt)) + logger.warning("K>%.1f: only %d train rows — skipping", line, len(train_filt)) continue - available_cols = [c for c in K_FEATURES if c in k_df.columns] - X_train = train_filt[available_cols].fillna(0).values.astype(np.float32) - y_train = train_filt[label_col].values - X_test = test_filt[available_cols].fillna(0).values.astype(np.float32) if len(test_filt) else X_train[:0] - y_test = test_filt[label_col].values if len(test_filt) else y_train[:0] + avail = [c for c in K_FEATURES if c in k_df.columns] + X_train = train_filt[avail].fillna(0).values.astype(np.float32) + y_train = train_filt[label_col].values + X_test = test_filt[avail].fillna(0).values.astype(np.float32) if len(test_filt) else X_train[:0] + y_test = test_filt[label_col].values if len(test_filt) else y_train[:0] + + # Recency weights: combine season weight × ledger boost + sw = _make_sample_weights(train_filt) + if "_ledger_boost" in train_filt.columns: + sw = sw * train_filt["_ledger_boost"].values safe_line = str(line).replace(".", "_") out_path = os.path.join(OUTDIR, f"xgb_k_{safe_line}.pkl") metrics = _train_and_save(X_train, y_train, X_test, y_test, - f"K>{line}", out_path) - n_train_k = int(len(X_train)) - all_metrics[f"k_{line}"] = {**metrics, "train_rows": n_train_k, - "features": available_cols} - - # ── PR #562: Persist to DB ──────────────────────────────────────── - _save_model_to_db( - prop_type = f"k_{line}", - pkl_path = out_path, - metrics = metrics, - n_train = n_train_k, - feature_names = available_cols, - ) + f"K>{line}", out_path, + sample_weights=sw) + n_train = int(len(X_train)) + all_metrics[f"k_{line}"] = {**metrics, "train_rows": n_train, "features": avail} + + _save_model_to_db(f"k_{line}", out_path, metrics, n_train, avail) + + # SHAP for K4.5 (most common line) + k45_path = os.path.join(OUTDIR, "xgb_k_4_5.pkl") + if os.path.exists(k45_path) and not k_df.empty: + all_metrics["shap_k_4_5"] = 
_run_shap(k45_path, k_df, K_FEATURES) - # ── Train hit model ────────────────────────────────────────────────────── + # ── Train hit model ───────────────────────────────────────────────────── if not hit_df.empty and "actual_outcome" in hit_df.columns: - logger.info("\n=== Hit Model ===") + logger.info("\n=== Hit Model (season-weighted) ===") + if "season" in hit_df.columns: train_h = hit_df[hit_df["season"] != TEST_YEAR] test_h = hit_df[hit_df["season"] == TEST_YEAR] else: split = int(len(hit_df) * 0.80) - train_h = hit_df.iloc[:split] - test_h = hit_df.iloc[split:] + train_h, test_h = hit_df.iloc[:split], hit_df.iloc[split:] - available_cols = [c for c in HITS_FEATURES if c in hit_df.columns] - X_train_h = train_h[available_cols].fillna(0).values.astype(np.float32) + avail_h = [c for c in HITS_FEATURES if c in hit_df.columns] + X_train_h = train_h[avail_h].fillna(0).values.astype(np.float32) y_train_h = train_h["actual_outcome"].values - X_test_h = test_h[available_cols].fillna(0).values.astype(np.float32) if len(test_h) else X_train_h[:0] - y_test_h = test_h["actual_outcome"].values if len(test_h) else y_train_h[:0] - - out_path = os.path.join(OUTDIR, "xgb_hits.pkl") - metrics = _train_and_save(X_train_h, y_train_h, X_test_h, y_test_h, - "Hits", out_path) - n_train_h = int(len(X_train_h)) - all_metrics["hits"] = {**metrics, "train_rows": n_train_h, - "features": available_cols} - - # ── PR #562: Persist to DB ──────────────────────────────────────────── - _save_model_to_db( - prop_type = "hits", - pkl_path = out_path, - metrics = metrics, - n_train = n_train_h, - feature_names = available_cols, - ) - - # ── SHAP importance for K 4.5 model ────────────────────────────────────── - k45_path = os.path.join(OUTDIR, "xgb_k_4_5.pkl") - if os.path.exists(k45_path) and not k_df.empty: - try: - import shap, pickle as _pickle - with open(k45_path, "rb") as f: - k45 = _pickle.load(f) - base_model = k45.calibrated_classifiers_[0].estimator - avail = [c for c in K_FEATURES if 
c in k_df.columns] - X_shap = k_df[avail].fillna(0).values.astype(np.float32) - idx = np.random.choice(len(X_shap), min(2000, len(X_shap)), replace=False) - exp = shap.TreeExplainer(base_model) - sv = exp.shap_values(X_shap[idx]) - mean_s = np.abs(sv).mean(axis=0) - ranked = sorted(zip(avail, mean_s), key=lambda x: x[1], reverse=True) - logger.info("\n=== K4.5 SHAP Feature Importance ===") - for feat, imp in ranked: - bar = "█" * int(imp / ranked[0][1] * 20) - logger.info(" %-22s %s %.4f", feat, bar, imp) - all_metrics["shap_k_4_5"] = [{"feature": f, "importance": round(float(i), 4)} - for f, i in ranked] - except Exception as e: - logger.warning("SHAP failed: %s", e) + X_test_h = test_h[avail_h].fillna(0).values.astype(np.float32) if len(test_h) else X_train_h[:0] + y_test_h = test_h["actual_outcome"].values if len(test_h) else y_train_h[:0] + + sw_h = _make_sample_weights(train_h) + if "_ledger_boost" in train_h.columns: + sw_h = sw_h * train_h["_ledger_boost"].values + + out_path_h = os.path.join(OUTDIR, "xgb_hits.pkl") + metrics_h = _train_and_save(X_train_h, y_train_h, X_test_h, y_test_h, + "Hits", out_path_h, sample_weights=sw_h) + n_train_h = int(len(X_train_h)) + all_metrics["hits"] = {**metrics_h, "train_rows": n_train_h, "features": avail_h} + + _save_model_to_db("hits", out_path_h, metrics_h, n_train_h, avail_h) + + # Blend recommendation for hits + hit_brier = metrics_h.get("brier", 0.25) + if hit_brier and hit_brier > 0.25: + all_metrics["blend_recommendation"]["hits"] = ( + f"90/10 — Brier {hit_brier:.4f} > null (0.25). " + "Reduce from current 70/30 to limit noise contribution." + ) + logger.warning("⚠️ Hit model Brier %.4f > null — recommend 90/10 blend", hit_brier) + elif hit_brier and hit_brier < 0.23: + all_metrics["blend_recommendation"]["hits"] = ( + f"60/40 — Brier {hit_brier:.4f} well below null. " + "Consider increasing blend weight." 
+ ) - # ── Save metadata ──────────────────────────────────────────────────────── - feat_cols_out = { - f"k_{line}": K_FEATURES for line in K_LINES - } + # ── Save feature cols and metrics ─────────────────────────────────────── + feat_cols_out = {f"k_{line}": K_FEATURES for line in K_LINES} feat_cols_out["hits"] = HITS_FEATURES with open(os.path.join(OUTDIR, "xgb_feature_cols.json"), "w") as f: json.dump(feat_cols_out, f, indent=2) - logger.info("\nSaved → models/xgb_feature_cols.json") with open(os.path.join(OUTDIR, "model_metrics.json"), "w") as f: json.dump(all_metrics, f, indent=2) - logger.info("Saved → models/model_metrics.json") - logger.info("\n✅ Training complete. Saved to %s and xgb_model_store DB table.", OUTDIR) - logger.info(" Models load from DB on next Railway restart (xgb_k_layer.py).") + logger.info("\n✅ Training complete.") + logger.info(" Run: python scripts/xgb_k_training.py --status") + show_status() if __name__ == "__main__": - main() + if "--status" in sys.argv: + show_status() + elif "--k-only" in sys.argv: + main(k_only=True) + elif "--hit-only" in sys.argv: + main(hit_only=True) + else: + main() diff --git a/update_blend_weights.py b/update_blend_weights.py new file mode 100644 index 0000000..7c083a5 --- /dev/null +++ b/update_blend_weights.py @@ -0,0 +1,239 @@ +""" +update_blend_weights.py +======================== +Reads model_metrics.json after training and automatically updates the +XGBoost blend weights in xgb_k_layer.py based on actual Brier scores. + +THE PROBLEM +----------- +The blend weights (80/20 for K, 70/30 for hits) were set as fixed constants +based on theory, not measurement. 
Now that we have real Brier scores: + - Hit model Brier = 0.2668 (WORSE than null at 0.25) → 70/30 is wrong + - K model Brier = 0.2458 (barely better than null) → 80/20 is marginal + +BLEND SCHEDULE (based on Brier) +-------------------------------- +Brier < 0.23: Model has real edge → 70/30 (increase XGB weight) +Brier < 0.25: Marginal edge → 80/20 (current default) +Brier >= 0.25: Worse than null → 90/10 (reduce XGB, limit noise) +Brier >= 0.27: Actively hurting → 95/5 (minimal contribution only) + +USAGE +----- + python update_blend_weights.py # preview changes (no writes) + python update_blend_weights.py --apply # write changes to xgb_k_layer.py + python update_blend_weights.py --status # show current blend weights in code +""" + +from __future__ import annotations + +import json +import logging +import re +import sys +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format="%(asctime)s [BLEND] %(message)s") +log = logging.getLogger(__name__) + +METRICS_FILE = Path("models/model_metrics.json") +XGB_LAYER = Path("xgb_k_layer.py") + +NULL_BRIER = 0.25 # null model always predicts 50% + + +def _get_blend_weight(brier: float | None, model_name: str) -> tuple[float, float, str]: + """ + Return (formula_weight, xgb_weight, reason) based on Brier score. 
+ formula_weight + xgb_weight = 1.0 + """ + if brier is None: + return 0.90, 0.10, "no test data — using conservative 90/10" + if brier < 0.23: + return 0.70, 0.30, f"Brier {brier:.4f} well below null — strong edge, 70/30" + if brier < NULL_BRIER: + return 0.80, 0.20, f"Brier {brier:.4f} marginal edge over null — 80/20" + if brier < 0.27: + return 0.90, 0.10, f"Brier {brier:.4f} ≥ null (0.25) — reducing to 90/10" + return 0.95, 0.05, f"Brier {brier:.4f} actively hurting — minimal 95/5" + + +def load_metrics() -> dict: + if not METRICS_FILE.exists(): + log.error("models/model_metrics.json not found — run xgb_k_training.py first") + return {} + return json.loads(METRICS_FILE.read_text()) + + +def compute_recommendations(metrics: dict) -> dict: + """Compute blend weight recommendations from training metrics.""" + recs = {} + + # K models — all share the same blend weight (averaged across lines) + k_briers = [] + for line in [3.5, 4.5, 5.5, 6.5]: + key = f"k_{line}" + b = metrics.get(key, {}).get("brier") + if b: + k_briers.append(b) + + avg_k_brier = sum(k_briers) / len(k_briers) if k_briers else None + fw_k, xgb_k, reason_k = _get_blend_weight(avg_k_brier, "K") + recs["k"] = { + "formula_weight": fw_k, + "xgb_weight": xgb_k, + "avg_brier": round(avg_k_brier, 4) if avg_k_brier else None, + "reason": reason_k, + } + + # Hit model + hit_brier = metrics.get("hits", {}).get("brier") + fw_h, xgb_h, reason_h = _get_blend_weight(hit_brier, "hits") + recs["hits"] = { + "formula_weight": fw_h, + "xgb_weight": xgb_h, + "brier": round(hit_brier, 4) if hit_brier else None, + "reason": reason_h, + } + + return recs + + +def show_current_weights() -> None: + """Show what blend weights are currently in xgb_k_layer.py.""" + if not XGB_LAYER.exists(): + print("xgb_k_layer.py not found.") + return + content = XGB_LAYER.read_text() + print("\nCurrent blend weights in xgb_k_layer.py:") + + # Find K blend + m_k = re.search(r"(\d+\.\d+) \* model_prob \+ (\d+\.\d+) \* _xkp", content) + if 
m_k: + xgb_w = float(m_k.group(2)) + print(f" K props: formula={1-xgb_w:.0%} / XGB={xgb_w:.0%}") + else: + print(" K props: pattern not found") + + # Find hit blend + m_h = re.search(r"(\d+\.\d+) \* model_prob \+ (\d+\.\d+) \* _xhp", content) + if m_h: + xgb_w = float(m_h.group(2)) + print(f" Hit props: formula={1-xgb_w:.0%} / XGB={xgb_w:.0%}") + else: + print(" Hit props: pattern not found") + + +def apply_blend_updates(recs: dict, dry_run: bool = True) -> bool: + """Patch xgb_k_layer.py with recommended blend weights.""" + if not XGB_LAYER.exists(): + log.error("xgb_k_layer.py not found.") + return False + + content = XGB_LAYER.read_text() + original = content + changed = False + + # Update K blend: pattern "0.XX * model_prob + 0.YY * _xkp" + k_fw = recs["k"]["formula_weight"] + k_xgb = recs["k"]["xgb_weight"] + + k_old = re.search(r"(\d+\.\d+) \* model_prob \+ (\d+\.\d+) \* _xkp", content) + if k_old: + old_str = k_old.group(0) + new_str = f"{k_fw:.2f} * model_prob + {k_xgb:.2f} * _xkp" + if old_str != new_str: + content = content.replace(old_str, new_str, 1) + log.info("K blend: %s → %s (%s)", + old_str, new_str, recs["k"]["reason"]) + changed = True + else: + log.info("K blend already at %s — no change needed", old_str) + else: + log.warning("K blend pattern not found in xgb_k_layer.py") + + # Update hit blend: pattern "0.XX * model_prob + 0.YY * _xhp" + h_fw = recs["hits"]["formula_weight"] + h_xgb = recs["hits"]["xgb_weight"] + + h_old = re.search(r"(\d+\.\d+) \* model_prob \+ (\d+\.\d+) \* _xhp", content) + if h_old: + old_str = h_old.group(0) + new_str = f"{h_fw:.2f} * model_prob + {h_xgb:.2f} * _xhp" + if old_str != new_str: + content = content.replace(old_str, new_str, 1) + log.info("Hit blend: %s → %s (%s)", + old_str, new_str, recs["hits"]["reason"]) + changed = True + else: + log.info("Hit blend already at %s — no change needed", old_str) + else: + log.warning("Hit blend pattern not found in xgb_k_layer.py") + + if dry_run: + if changed: + 
log.info("DRY RUN — changes NOT written. Run with --apply to write.") + else: + log.info("No changes needed.") + return changed + + if changed: + XGB_LAYER.write_text(content) + log.info("xgb_k_layer.py updated with new blend weights.") + + # Update calibration_params.json with blend info + cal_path = Path("data/calibration_params.json") + if cal_path.exists(): + try: + cal = json.loads(cal_path.read_text()) + cal["xgb_blend_weights"] = { + "k": {"formula": k_fw, "xgb": k_xgb}, + "hits":{"formula": h_fw, "xgb": h_xgb}, + } + cal["calibration_notes"] = cal.get("calibration_notes", []) + from datetime import date + cal["calibration_notes"].append( + f"[{date.today().isoformat()}] Blend weights updated: " + f"K={k_fw:.0%}/{k_xgb:.0%} Hits={h_fw:.0%}/{h_xgb:.0%} " + f"based on Brier K={recs['k']['avg_brier']} " + f"Hits={recs['hits']['brier']}" + ) + cal_path.write_text(json.dumps(cal, indent=2)) + log.info("calibration_params.json updated with blend weights.") + except Exception as e: + log.warning("Failed to update calibration_params.json: %s", e) + else: + log.info("No changes needed.") + + return changed + + +def main() -> None: + metrics = load_metrics() + if not metrics: + return + + recs = compute_recommendations(metrics) + + print("\n=== XGBoost Blend Weight Recommendations ===") + print(f" Null model Brier: {NULL_BRIER} (baseline — worse = model is noise)") + print() + + for model, rec in recs.items(): + brier_str = f"{rec.get('brier') or rec.get('avg_brier') or 'N/A'}" + print(f" {model.upper()}") + print(f" Brier: {brier_str}") + print(f" Blend: {rec['formula_weight']:.0%} formula / {rec['xgb_weight']:.0%} XGB") + print(f" Reason: {rec['reason']}") + print() + + show_current_weights() + + apply_arg = "--apply" in sys.argv + if "--status" not in sys.argv: + print(f"\n{'Applying changes...' if apply_arg else 'DRY RUN — use --apply to write changes'}") + apply_blend_updates(recs, dry_run=not apply_arg) + + +if __name__ == "__main__": + main()