braintrustdata · Abhijeet Prasad (AbhiPrasad) · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/.github/actions/setup-python-env/action.yml b/.github/actions/setup-python-env/action.yml
@@ -0,0 +1,21 @@
+name: "Setup Python environment"
+description: "Checkout, configure mise, and install dev dependencies for a given Python version"
+
+inputs:
+  python-version:
+    description: "Python version to install (e.g. 3.12)"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Set up mise
+      uses: jdx/mise-action@1648a7812b9aeae629881980618f079932869151 # v4.0.1
+      with:
+        cache: true
+        experimental: true
+        install_args: python@${{ inputs.python-version }}
+    - name: Install dependencies
+      shell: bash
+      run: |
+        mise exec python@${{ inputs.python-version }} -- make -C py install-dev
diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml
@@ -45,15 +45,10 @@ jobs:
 
     steps:
       - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
-      - name: Set up mise
-        uses: jdx/mise-action@1648a7812b9aeae629881980618f079932869151 # v4.0.1
+      - name: Setup Python environment
+        uses: ./.github/actions/setup-python-env
         with:
-          cache: true
-          experimental: true
-          install_args: python@${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          mise exec python@${{ matrix.python-version }} -- make -C py install-dev
+          python-version: ${{ matrix.python-version }}
       - name: Test whether the Python SDK can be installed
         run: |
           # This is already done by make install-dev, but we're keeping this as a separate step
@@ -72,23 +67,18 @@ jobs:
       matrix:
         python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
         os: [ubuntu-latest, windows-latest]
-        shard: [0, 1]
+        shard: [0, 1, 2, 3]
 
     steps:
       - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
-      - name: Set up mise
-        uses: jdx/mise-action@1648a7812b9aeae629881980618f079932869151 # v4.0.1
+      - name: Setup Python environment
+        uses: ./.github/actions/setup-python-env
         with:
-          cache: true
-          experimental: true
-          install_args: python@${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          mise exec python@${{ matrix.python-version }} -- make -C py install-dev
-      - name: Run nox tests (shard ${{ matrix.shard }}/2)
+          python-version: ${{ matrix.python-version }}
+      - name: Run nox tests (shard ${{ matrix.shard }}/4)
         shell: bash
         run: |
-          mise exec python@${{ matrix.python-version }} -- bash ./py/scripts/nox-matrix.sh ${{ matrix.shard }} 2
+          mise exec python@${{ matrix.python-version }} -- python ./py/scripts/nox-matrix.py ${{ matrix.shard }} 4
 
   adk-py:
     uses: ./.github/workflows/adk-py-test.yaml

diff --git a/.github/workflows/update-session-weights.yaml b/.github/workflows/update-session-weights.yaml
@@ -0,0 +1,78 @@
+name: update-session-weights
+
+on:
+  schedule:
+    # Every Monday at 06:00 UTC
+    - cron: "0 6 * * 1"
+  workflow_dispatch: {}
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  measure:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    strategy:
+      fail-fast: false
+      matrix:
+        shard: [0, 1, 2, 3]
+
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - name: Setup Python environment
+        uses: ./.github/actions/setup-python-env
+        with:
+          python-version: "3.13"
+      - name: Run nox tests (shard ${{ matrix.shard }}/4)
+        run: |
+          mise exec python@3.13 -- python ./py/scripts/nox-matrix.py ${{ matrix.shard }} 4 \
+            --output-durations measured-durations-${{ matrix.shard }}.json
+      - name: Upload measured durations
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: session-durations-shard-${{ matrix.shard }}
+          path: measured-durations-${{ matrix.shard }}.json
+          retention-days: 5
+
+  update:
+    needs: [measure]
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - name: Download measured durations
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        with:
+          pattern: session-durations-shard-*
+          merge-multiple: true
+      - name: Check for drift
+        id: check
+        run: |
+          if python ./py/scripts/check-session-weights.py measured-durations-*.json; then
+            echo "drifted=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "drifted=true" >> "$GITHUB_OUTPUT"
+          fi
+      - name: Update weights
+        if: steps.check.outputs.drifted == 'true'
+        run: |
+          python ./py/scripts/check-session-weights.py --update measured-durations-*.json
+      - name: Create pull request
+        if: steps.check.outputs.drifted == 'true'
+        uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
+        with:
+          commit-message: "ci: update nox session weights"
+          branch: auto/update-session-weights
+          title: "ci: update nox session weights"
+          body: |
+            Automated weekly update of `py/scripts/session-weights.json`.
+
+            Session durations were re-measured on `ubuntu-latest` with Python 3.13
+            and at least one session drifted beyond the 50% threshold.
+
+            This keeps nox shard balancing accurate.
+          labels: ci
+          delete-branch: true
diff --git a/py/scripts/check-session-weights.py b/py/scripts/check-session-weights.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""Compare measured nox session durations against session-weights.json.
+
+Reads one or more measured-duration JSON files (produced by
+``nox-matrix.py --output-durations``), merges them, and reports sessions
+whose actual duration drifted significantly from the recorded weight.
+
+Exit codes:
+    0 — all weights are within tolerance (new/missing sessions are reported
+        but do not cause a non-zero exit; they receive the default weight)
+    1 — at least one weight drifted beyond the threshold
+
+Usage:
+    python check-session-weights.py measured-shard-0.json measured-shard-1.json ...
+
+To update weights after downloading the measured-durations artifacts from CI:
+    python check-session-weights.py --update measured-shard-0.json measured-shard-1.json ...
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+# A session is flagged when its measured duration differs from the recorded
+# weight by more than this fraction (0.5 = 50%).
+DRIFT_THRESHOLD = 0.5
+
+# Ignore drift for sessions shorter than this (seconds).  Short sessions
+# have high relative variance and aren't worth chasing.
+MIN_DURATION_FOR_DRIFT = 8
+
+
+def update_weights(weights_path: Path, weights_data: dict, measured: dict[str, int]) -> None:
+    """Overwrite session-weights.json with measured durations."""
+    meta_keys = {k for k in weights_data if k.startswith("_")}
+    updated = {k: weights_data[k] for k in sorted(meta_keys)}
+    # Merge: keep measured values, drop sessions that no longer exist
+    all_sessions = sorted(set(weights_data.keys() - meta_keys) | set(measured.keys()))
+    for session in all_sessions:
+        if session in measured:
+            updated[session] = measured[session]
+        else:
+            # Session wasn't measured — keep the old weight (may be platform-specific
+            # or skipped; a full run across all shards would cover everything)
+            updated[session] = weights_data[session]
+    with open(weights_path, "w") as f:
+        json.dump(updated, f, indent=2, sort_keys=True)
+        f.write("\n")
+    n_changed = sum(1 for s in measured if s not in meta_keys and weights_data.get(s) != measured[s])
+    n_new = sum(1 for s in measured if s not in weights_data)
+    print(
+        f"✅ Updated {weights_path} ({n_changed} changed, {n_new} new, {len(updated) - len(meta_keys)} total sessions)"
+    )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("measured_files", nargs="+", type=Path, help="JSON files with measured durations")
+    parser.add_argument(
+        "--weights",
+        type=Path,
+        default=Path(__file__).parent / "session-weights.json",
+        help="Path to session-weights.json (default: co-located with this script)",
+    )
+    parser.add_argument(
+        "--update",
+        action="store_true",
+        help="Update session-weights.json with measured durations and exit",
+    )
+    args = parser.parse_args()
+
+    # Merge all measured durations (later files overwrite earlier for the same session)
+    measured: dict[str, int] = {}
+    for path in args.measured_files:
+        with open(path) as f:
+            measured.update(json.load(f))
+
+    if not measured:
+        print("⚠️  No measured durations found — nothing to check.")
+        sys.exit(0)
+
+    with open(args.weights) as f:
+        weights_data: dict[str, int] = json.load(f)
+
+    if args.update:
+        update_weights(args.weights, weights_data, measured)
+        return
+
+    meta_keys = {k for k in weights_data if k.startswith("_")}
+
+    drifted: list[str] = []
+    new_sessions: list[str] = []
+
+    print(f"Comparing {len(measured)} measured sessions against session-weights.json\n")
+    print(f"{'Session':<50} {'Expected':>8} {'Actual':>8} {'Drift':>8}")
+    print("-" * 78)
+
+    for session in sorted(measured):
+        actual = measured[session]
+        expected = weights_data.get(session)
+
+        if expected is None:
+            new_sessions.append(session)
+            print(f"{session:<50} {'(new)':>8} {actual:>7}s {'':>8}")
+            continue
+
+        if expected == 0:
+            drift_pct = 0.0
+        else:
+            drift_pct = (actual - expected) / expected
+
+        flag = ""
+        if abs(drift_pct) > DRIFT_THRESHOLD and max(actual, expected) >= MIN_DURATION_FOR_DRIFT:
+            flag = " ⚠️"
+            drifted.append(session)
+
+        print(f"{session:<50} {expected:>7}s {actual:>7}s {drift_pct:>+7.0%}{flag}")
+
+    # Check for sessions in weights but not measured (may have been removed)
+    known_sessions = {k for k in weights_data if k not in meta_keys}
+    missing = sorted(known_sessions - set(measured))
+
+    print()
+
+    if new_sessions:
+        print(f"🆕 {len(new_sessions)} new session(s) not in session-weights.json:")
+        for s in new_sessions:
+            print(f"   {s}: {measured[s]}s")
+        print()
+
+    if missing:
+        print(f"❓ {len(missing)} session(s) in session-weights.json but not measured")
+        print("   (may be in another shard — only a concern if missing from ALL shards):")
+        for s in missing:
+            print(f"   {s}")
+        print()
+
+    if drifted:
+        print(f"⚠️  {len(drifted)} session(s) drifted beyond {DRIFT_THRESHOLD:.0%} threshold.")
+        print("   Consider updating py/scripts/session-weights.json")
+        sys.exit(1)
+    else:
+        print("✅ All session weights are within tolerance.")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()