# Composite action shared by the Python CI workflows: installs the requested
# Python through mise and then installs the dev dependencies via `make`.
#
# This is a local action (referenced as ./.github/actions/setup-python-env) and
# it contains no checkout step, so the calling job must run actions/checkout
# before using it.
name: "Setup Python environment"
description: "Configure mise and install dev dependencies for a given Python version (the repository must already be checked out)"

inputs:
  python-version:
    description: "Python version to install (e.g. 3.12)"
    required: true

runs:
  using: "composite"
  steps:
    - name: Set up mise
      uses: jdx/mise-action@1648a7812b9aeae629881980618f079932869151 # v4.0.1
      with:
        cache: true
        experimental: true
        install_args: python@${{ inputs.python-version }}
    - name: Install dependencies
      shell: bash
      # Pass the input through the environment instead of interpolating it into
      # the script text, so its value can never be parsed as shell syntax.
      env:
        PYTHON_VERSION: ${{ inputs.python-version }}
      run: |
        mise exec "python@${PYTHON_VERSION}" -- make -C py install-dev
# Weekly job that re-measures nox session durations and, when they have
# drifted from py/scripts/session-weights.json, opens a pull request with
# refreshed weights.
name: update-session-weights

on:
  schedule:
    # Every Monday at 06:00 UTC
    - cron: "0 6 * * 1"
  workflow_dispatch: {}

# Read-only by default. Only the `update` job, which pushes a branch and opens
# the pull request, is granted write access below.
permissions:
  contents: read

jobs:
  measure:
    runs-on: ubuntu-latest
    timeout-minutes: 30

    strategy:
      fail-fast: false
      matrix:
        shard: [0, 1, 2, 3]

    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
      - name: Setup Python environment
        uses: ./.github/actions/setup-python-env
        with:
          python-version: "3.13"
      - name: Run nox tests (shard ${{ matrix.shard }}/4)
        run: |
          mise exec python@3.13 -- python ./py/scripts/nox-matrix.py ${{ matrix.shard }} 4 \
            --output-durations measured-durations-${{ matrix.shard }}.json
      - name: Upload measured durations
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: session-durations-shard-${{ matrix.shard }}
          path: measured-durations-${{ matrix.shard }}.json
          retention-days: 5

  update:
    needs: [measure]
    runs-on: ubuntu-latest
    timeout-minutes: 5
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
      - name: Download measured durations
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          pattern: session-durations-shard-*
          merge-multiple: true
      - name: Check for drift
        id: check
        run: |
          # check-session-weights.py documents exit code 0 as "within tolerance"
          # and 1 as "drifted". Any other code (e.g. 2 for a usage error) is a
          # real failure and must not be mistaken for drift.
          status=0
          python ./py/scripts/check-session-weights.py measured-durations-*.json || status=$?
          case "$status" in
            0) echo "drifted=false" >> "$GITHUB_OUTPUT" ;;
            1) echo "drifted=true" >> "$GITHUB_OUTPUT" ;;
            *)
              echo "check-session-weights.py failed with exit code $status" >&2
              exit "$status"
              ;;
          esac
      - name: Update weights
        if: steps.check.outputs.drifted == 'true'
        run: |
          python ./py/scripts/check-session-weights.py --update measured-durations-*.json
      - name: Create pull request
        if: steps.check.outputs.drifted == 'true'
        uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
        with:
          commit-message: "ci: update nox session weights"
          branch: auto/update-session-weights
          title: "ci: update nox session weights"
          body: |
            Automated weekly update of `py/scripts/session-weights.json`.

            Session durations were re-measured on `ubuntu-latest` with Python 3.13
            and at least one session drifted beyond the 50% threshold.

            This keeps nox shard balancing accurate.
          labels: ci
          delete-branch: true
def update_weights(weights_path: Path, weights_data: dict, measured: dict[str, int]) -> None:
    """Rewrite ``weights_path`` with measured durations merged into the existing weights.

    Args:
        weights_path: JSON file to overwrite.
        weights_data: Current contents of the weights file. Keys starting with
            ``_`` are treated as metadata and carried over unchanged.
        measured: Mapping of session name to measured duration in seconds.

    Every measured session takes its measured value. Sessions that are recorded
    but were not measured keep their old weight; nothing is removed. A one-line
    summary is printed to stdout.
    """
    meta_keys = {k for k in weights_data if k.startswith("_")}
    known_sessions = set(weights_data) - meta_keys

    updated = {k: weights_data[k] for k in meta_keys}
    for session in known_sessions | set(measured):
        if session in measured:
            updated[session] = measured[session]
        else:
            # Not measured in this run (e.g. platform-specific or skipped):
            # keep the recorded weight rather than dropping the session.
            updated[session] = weights_data[session]

    # sort_keys gives the file a stable, diff-friendly key order.
    with open(weights_path, "w") as f:
        json.dump(updated, f, indent=2, sort_keys=True)
        f.write("\n")

    n_new = sum(1 for s in measured if s not in weights_data)
    # Count only sessions that already had a weight; new sessions are reported
    # separately and must not be counted as "changed" as well.
    n_changed = sum(1 for s in measured if s in known_sessions and weights_data[s] != measured[s])
    print(
        f"✅ Updated {weights_path} ({n_changed} changed, {n_new} new, {len(updated) - len(meta_keys)} total sessions)"
    )
def main() -> None:
    """Compare measured session durations with the recorded weights.

    Merges every measured-durations file given on the command line. With
    ``--update`` the weights file is rewritten from the merged measurements.
    Otherwise a comparison table is printed and the process exits with status 1
    if at least one session drifted beyond ``DRIFT_THRESHOLD``, or 0 if not.
    New and unmeasured sessions are reported but never change the exit status.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("measured_files", nargs="+", type=Path, help="JSON files with measured durations")
    parser.add_argument(
        "--weights",
        type=Path,
        default=Path(__file__).parent / "session-weights.json",
        help="Path to session-weights.json (default: co-located with this script)",
    )
    parser.add_argument(
        "--update",
        action="store_true",
        help="Update session-weights.json with measured durations and exit",
    )
    args = parser.parse_args()

    # Merge all measured durations (later files overwrite earlier for the same session)
    measured: dict[str, int] = {}
    for path in args.measured_files:
        with open(path) as f:
            measured.update(json.load(f))

    if not measured:
        print("⚠️  No measured durations found — nothing to check.")
        sys.exit(0)

    with open(args.weights) as f:
        weights_data: dict[str, int] = json.load(f)

    if args.update:
        update_weights(args.weights, weights_data, measured)
        return

    meta_keys = {k for k in weights_data if k.startswith("_")}

    drifted: list[str] = []
    new_sessions: list[str] = []

    print(f"Comparing {len(measured)} measured sessions against session-weights.json\n")
    print(f"{'Session':<50} {'Expected':>8} {'Actual':>8} {'Drift':>8}")
    print("-" * 78)

    for session in sorted(measured):
        actual = measured[session]
        expected = weights_data.get(session)

        if expected is None:
            new_sessions.append(session)
            print(f"{session:<50} {'(new)':>8} {actual:>7}s {'':>8}")
            continue

        if expected == 0:
            # No ratio exists against a zero weight. Treat any non-zero
            # measurement as unbounded drift so it can still be flagged
            # (subject to the minimum-duration check below) instead of being
            # reported as 0%.
            drift_pct = float("inf") if actual > 0 else 0.0
        else:
            drift_pct = (actual - expected) / expected

        flag = ""
        # Short sessions are exempt: their relative variance is high.
        if abs(drift_pct) > DRIFT_THRESHOLD and max(actual, expected) >= MIN_DURATION_FOR_DRIFT:
            flag = " ⚠️"
            drifted.append(session)

        print(f"{session:<50} {expected:>7}s {actual:>7}s {drift_pct:>+7.0%}{flag}")

    # Check for sessions in weights but not measured (may have been removed)
    known_sessions = {k for k in weights_data if k not in meta_keys}
    missing = sorted(known_sessions - set(measured))

    print()

    if new_sessions:
        print(f"🆕 {len(new_sessions)} new session(s) not in session-weights.json:")
        for s in new_sessions:
            print(f"   {s}: {measured[s]}s")
        print()

    if missing:
        print(f"❓ {len(missing)} session(s) in session-weights.json but not measured")
        print("   (may be in another shard — only a concern if missing from ALL shards):")
        for s in missing:
            print(f"   {s}")
        print()

    if drifted:
        print(f"⚠️  {len(drifted)} session(s) drifted beyond {DRIFT_THRESHOLD:.0%} threshold.")
        print("   Consider updating py/scripts/session-weights.json")
        sys.exit(1)
    else:
        print("✅ All session weights are within tolerance.")
        sys.exit(0)
def get_nox_sessions(noxfile: Path) -> list[str]:
    """Return the sorted session names that ``nox -l -f <noxfile>`` marks with ``* ``.

    Raises:
        subprocess.CalledProcessError: if the ``nox`` command exits non-zero.
    """
    result = subprocess.run(
        ["nox", "-l", "-f", str(noxfile)],
        capture_output=True,
        text=True,
        check=True,
    )
    sessions: list[str] = []
    for line in result.stdout.splitlines():
        if line.startswith("* "):
            # Strip the leading "* " and any " -> description" suffix
            name = line[2:].split(" -> ")[0].strip()
            sessions.append(name)
    return sorted(sessions)


def load_weights(weights_file: Path) -> tuple[dict[str, int], int]:
    """Return ``(weights_map, default_weight)`` read from the JSON weights file.

    A missing file yields an empty map. ``default_weight`` comes from the
    ``_default`` key and falls back to 15. Keys starting with ``_`` are
    metadata (``_comment`` holds a string) and are left out of the returned
    map so that it only contains session weights.
    """
    try:
        with open(weights_file) as f:
            data = json.load(f)
    except FileNotFoundError:
        data = {}
    default = data.get("_default", 15)
    weights = {name: value for name, value in data.items() if not name.startswith("_")}
    return weights, default


def assign_shards(
    sessions: list[str],
    total_shards: int,
    weights: dict[str, int],
    default_weight: int,
) -> list[list[str]]:
    """Assign sessions to shards using greedy LPT bin-packing.

    Sessions are processed heaviest first and each one goes to the shard with
    the smallest running total. Sessions without a recorded weight use
    ``default_weight``. A per-shard summary is printed to stderr.

    Returns:
        One alphabetically sorted list of session names per shard.

    Raises:
        ValueError: if ``total_shards`` is less than 1.
    """
    if total_shards < 1:
        raise ValueError(f"total_shards must be at least 1, got {total_shards}")

    weighted = [(s, weights.get(s, default_weight)) for s in sessions]
    # The sort is stable, so equal weights keep their input order.
    weighted.sort(key=lambda x: -x[1])

    shard_totals = [0] * total_shards
    shard_assignments: list[list[str]] = [[] for _ in range(total_shards)]

    for name, weight in weighted:
        # min() returns the first minimum, so ties go to the lowest shard index.
        lightest = min(range(total_shards), key=lambda i: shard_totals[i])
        shard_assignments[lightest].append(name)
        shard_totals[lightest] += weight

    for i in range(total_shards):
        count = len(shard_assignments[i])
        total = shard_totals[i]
        print(f"  shard {i}: {count} sessions, ~{total}s", file=sys.stderr)

    # Sort each shard's sessions for deterministic output
    for assignments in shard_assignments:
        assignments.sort()

    return shard_assignments
def main() -> None:
    """Run (or, with ``--dry-run``, list) the nox sessions assigned to one shard.

    Validates the shard arguments, computes the shard assignment for every
    session listed by the noxfile, and runs this shard's sessions in a single
    ``nox`` invocation. The process exits with nox's return code. With
    ``--output-durations`` the durations parsed from nox's output are also
    written to the given JSON file.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("shard_index", type=int, help="Zero-based shard index")
    parser.add_argument("num_shards", type=int, help="Total number of shards")
    parser.add_argument("--dry-run", action="store_true", help="Print assignment without running nox")
    parser.add_argument(
        "--output-durations",
        type=Path,
        default=None,
        help="Write measured session durations (seconds) to a JSON file",
    )
    args = parser.parse_args()

    # A negative index would otherwise pass the check below and silently pick
    # a shard counted from the end of the list.
    if args.shard_index < 0:
        print(
            f"Error: shard_index ({args.shard_index}) must not be negative",
            file=sys.stderr,
        )
        sys.exit(1)

    # With shard_index >= 0 this also rejects num_shards < 1.
    if args.shard_index >= args.num_shards:
        print(
            f"Error: shard_index ({args.shard_index}) must be less than num_shards ({args.num_shards})",
            file=sys.stderr,
        )
        sys.exit(1)

    root_dir = Path(
        subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()
    )

    noxfile = root_dir / "py" / "noxfile.py"
    weights_file = root_dir / "py" / "scripts" / "session-weights.json"

    all_sessions = get_nox_sessions(noxfile)
    weights, default_weight = load_weights(weights_file)
    shard_assignments = assign_shards(all_sessions, args.num_shards, weights, default_weight)

    my_sessions = shard_assignments[args.shard_index]
    other_sessions = sorted(set(all_sessions) - set(my_sessions))

    print(
        f"nox matrix idx:{args.shard_index} shards:{args.num_shards} "
        f"running {len(my_sessions)}/{len(all_sessions)} sessions"
    )

    if args.dry_run:
        print("--------------------------------")
        print("Would run the following sessions:")
        print("\n".join(my_sessions))
        print()
        print("--------------------------------")
        print("Would skip the following sessions:")
        print("\n".join(other_sessions))
        return

    durations: dict[str, int] = {}

    if my_sessions:
        cmd = ["nox", "-f", str(noxfile), "-s", *my_sessions]

        if args.output_durations is None:
            sys.exit(subprocess.run(cmd).returncode)

        # Stream output while capturing session durations.
        # NOTE(review): only lines with exactly this wording are recorded, so
        # failed sessions and any duration printed in another format are left
        # out. Confirm against the nox version in use.
        duration_re = re.compile(r"nox > Session (\S+) was successful in (\d+) seconds\.")
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1)
        assert process.stdout is not None
        for line in process.stdout:
            sys.stdout.write(line)
            sys.stdout.flush()
            m = duration_re.search(line)
            if m:
                durations[m.group(1)] = int(m.group(2))
        process.wait()
        returncode = process.returncode
    else:
        # More shards than sessions: nothing is assigned here. Do not call
        # `nox -s` with an empty name list; its behaviour in that case is not
        # something this script should depend on.
        print("No sessions assigned to this shard; nothing to run.", file=sys.stderr)
        returncode = 0
        if args.output_durations is None:
            sys.exit(returncode)

    args.output_durations.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output_durations, "w") as f:
        json.dump(durations, f, indent=2, sort_keys=True)
        f.write("\n")
    print(f"Wrote {len(durations)} session durations to {args.output_durations}", file=sys.stderr)

    sys.exit(returncode)
true ]; then - echo "--------------------------------" - echo "Would run the following sessions:" - echo "$matches" - echo "" - echo "--------------------------------" - echo "Would skip the following sessions:" - echo "$misses" - exit 0 -fi - -# Build session list and run nox once -# Quote each session name to handle parentheses in names like test_openai(latest) -session_list=$(echo "$matches" | sed 's/.*/"&"/' | tr '\n' ' ') -eval "nox -f $NOXFILE -s $session_list" diff --git a/py/scripts/session-weights.json b/py/scripts/session-weights.json new file mode 100644 index 00000000..edf007ea --- /dev/null +++ b/py/scripts/session-weights.json @@ -0,0 +1,57 @@ +{ + "_comment": "Approximate session durations in seconds from CI (ubuntu, Python 3.13). Used by nox-matrix.py for balanced shard assignment. Re-measure periodically and update.", + "_default": 15, + "pylint": 55, + "test_agentscope(1.0.0)": 20, + "test_agentscope(latest)": 22, + "test_agno(2.1.0)": 18, + "test_agno(2.4.0)": 19, + "test_agno(latest)": 19, + "test_anthropic(0.48.0)": 15, + "test_anthropic(0.49.0)": 15, + "test_anthropic(0.50.0)": 15, + "test_anthropic(latest)": 17, + "test_autoevals(0.0.129)": 10, + "test_autoevals(latest)": 10, + "test_braintrust_core": 10, + "test_claude_agent_sdk(0.1.10)": 14, + "test_claude_agent_sdk(latest)": 14, + "test_cli": 4, + "test_core": 10, + "test_dspy(2.6.0)": 25, + "test_dspy(latest)": 25, + "test_google_adk(1.14.1)": 39, + "test_google_adk(latest)": 24, + "test_google_genai(1.30.0)": 15, + "test_google_genai(latest)": 15, + "test_langchain(0.3.28)": 23, + "test_langchain(latest)": 22, + "test_latest_wrappers_novcr": 15, + "test_litellm(1.74.0)": 23, + "test_litellm(1.83.0)": 25, + "test_mistral(1.12.4)": 14, + "test_mistral(latest)": 14, + "test_openai(1.71)": 17, + "test_openai(1.77.0)": 18, + "test_openai(1.91)": 18, + "test_openai(1.92)": 19, + "test_openai(latest)": 19, + "test_openai_agents(0.0.19)": 17, + "test_openai_agents(latest)": 17, + 
"test_openai_http2_streaming": 5, + "test_openrouter(0.6.0)": 5, + "test_openrouter(latest)": 5, + "test_otel": 4, + "test_otel_not_installed": 3, + "test_pydantic_ai_integration(1.10.0)": 23, + "test_pydantic_ai_integration(latest)": 25, + "test_pydantic_ai_logfire": 8, + "test_pydantic_ai_wrap_openai(0.1.9)": 16, + "test_pydantic_ai_wrap_openai(1.0.1)": 19, + "test_pydantic_ai_wrap_openai(latest)": 20, + "test_pytest_plugin(8.4.2)": 11, + "test_pytest_plugin(latest)": 11, + "test_temporal(1.19.0)": 6, + "test_temporal(1.20.0)": 5, + "test_temporal(latest)": 5 +}