From 79d27334935304289b38f48469f40a60ed218642 Mon Sep 17 00:00:00 2001 From: notque Date: Sat, 28 Mar 2026 19:17:35 -0700 Subject: [PATCH 01/12] feat(agent-comparison): add autoresearch optimization review flow --- pipelines/comprehensive-review/SKILL.md | 27 +- .../test_agent_comparison_optimize_loop.py | 132 ++ .../tests/test_eval_compare_optimization.py | 33 + skills/agent-comparison/SKILL.md | 141 ++- .../references/benchmark-tasks.md | 73 ++ .../references/optimization-guide.md | 181 +++ .../optimization-tasks.example.json | 32 + .../scripts/generate_variant.py | 297 +++++ .../agent-comparison/scripts/optimize_loop.py | 1107 +++++++++++++++++ skills/skill-creator/assets/eval_viewer.html | 454 ++++++- skills/skill-creator/scripts/eval_compare.py | 34 + 11 files changed, 2483 insertions(+), 28 deletions(-) create mode 100644 scripts/tests/test_agent_comparison_optimize_loop.py create mode 100644 scripts/tests/test_eval_compare_optimization.py create mode 100644 skills/agent-comparison/references/optimization-guide.md create mode 100644 skills/agent-comparison/references/optimization-tasks.example.json create mode 100644 skills/agent-comparison/scripts/generate_variant.py create mode 100644 skills/agent-comparison/scripts/optimize_loop.py diff --git a/pipelines/comprehensive-review/SKILL.md b/pipelines/comprehensive-review/SKILL.md index 8ba22ea..9ea2a61 100644 --- a/pipelines/comprehensive-review/SKILL.md +++ b/pipelines/comprehensive-review/SKILL.md @@ -1,23 +1,16 @@ --- name: comprehensive-review description: | - Unified 4-wave code review: Wave 0 auto-discovers packages/modules and - dispatches one language-specialist agent per package for deep per-package - analysis. Wave 1 dispatches 12 foundation reviewers in parallel (with Wave 0 - context). Wave 2 dispatches 10 deep-dive reviewers that receive Wave 0+1 - findings as context for targeted analysis. 
Wave 3 dispatches 4-5 adversarial - reviewers that challenge Wave 1+2 consensus — contrarian, skeptical senior, - user advocate, meta-process, and conditionally SAPCC structural. Aggregates - all findings by severity with wave-agreement labels (unanimous, majority, - contested), then auto-fixes ALL issues. Covers per-package deep review, - security, business logic, architecture, error handling, test coverage, type - design, code quality, comment analysis, language idioms, docs validation, - newcomer perspective, performance, concurrency, API contracts, dependencies, - error messages, dead code, naming, observability, config safety, migration - safety, and adversarial challenge. - Use for "comprehensive review", "full review", "review everything", "review - and fix", or "thorough code review". - Do NOT use for single-concern reviews (use individual agents instead). + Four-wave code review pipeline for large or high-risk changes. Wave 0 + auto-discovers packages/modules and runs per-package specialist review. Wave + 1 runs broad foundation reviewers in parallel. Wave 2 runs targeted deep-dive + reviewers using earlier findings as context. Wave 3 runs adversarial reviewers + that challenge consensus and surface missed risks. Aggregates findings by + severity and agreement level, deduplicates them, and can auto-fix issues + unless review-only mode is used. Use for "comprehensive review", "full + review", "review everything", "review and fix", or "thorough code review". + Do NOT use for narrow single-concern reviews; use smaller review skills + instead. 
effort: high version: 4.0.0 user-invocable: false diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py new file mode 100644 index 0000000..a0f2faa --- /dev/null +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -0,0 +1,132 @@ +import importlib.util +import json +from pathlib import Path +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def load_module(name: str, relative_path: str): + spec = importlib.util.spec_from_file_location(name, REPO_ROOT / relative_path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +def test_assess_target_rejects_missing_frontmatter(tmp_path): + optimize_loop = load_module( + "agent_comparison_optimize_loop", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + target = tmp_path / "SKILL.md" + target.write_text("# no frontmatter\nbody\n") + + scores = optimize_loop.assess_target( + target, + [{"query": "write tests", "should_trigger": True}], + "improve routing precision", + dry_run=True, + ) + + assert scores["parses"] is False + assert optimize_loop.composite_score(scores) == 0.0 + + +def test_check_protected_sections_rejects_missing_blocks(): + optimize_loop = load_module( + "agent_comparison_optimize_loop", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + original = ( + "alpha\n" + "\n" + "keep me\n" + "\n" + "omega\n" + ) + relocated = "alpha\nomega\n" + + assert optimize_loop.check_protected_sections(original, relocated) is False + + +def test_restore_protected_does_not_silently_reinsert_missing_blocks(): + generate_variant = load_module( + "agent_comparison_generate_variant", + "skills/agent-comparison/scripts/generate_variant.py", + ) + original = ( + "alpha\n" + "\n" + "keep me\n" + "\n" + "omega\n" + ) + variant = "alpha\nomega\n" + + restored = generate_variant.restore_protected(original, variant) + + assert restored == 
variant + + +def test_generate_variant_main_reads_current_content_from_file(tmp_path, monkeypatch, capsys): + generate_variant = load_module( + "agent_comparison_generate_variant", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + class FakeBlock: + def __init__(self, block_type: str, text: str): + self.type = block_type + if block_type == "thinking": + self.thinking = text + else: + self.text = text + + class FakeResponse: + def __init__(self): + self.content = [ + FakeBlock("thinking", "reasoning"), + FakeBlock( + "text", + "---\ndescription: updated\n---" + "updated", + ), + ] + self.usage = type("Usage", (), {"input_tokens": 1, "output_tokens": 2})() + + class FakeClient: + def __init__(self): + self.messages = type("Messages", (), {"create": lambda self, **kwargs: FakeResponse()})() + + class FakeAnthropicModule: + class Anthropic: + def __new__(cls): + return FakeClient() + + content_file = tmp_path / "current.md" + content_file.write_text("---\ndescription: current\n---\n") + + monkeypatch.setattr(generate_variant, "anthropic", FakeAnthropicModule) + monkeypatch.setattr( + sys, + "argv", + [ + "generate_variant.py", + "--target", + "skills/example/SKILL.md", + "--goal", + "improve routing precision", + "--current-content-file", + str(content_file), + "--model", + "fake-model", + ], + ) + + generate_variant.main() + output = json.loads(capsys.readouterr().out) + + assert output["variant"] == "---\ndescription: updated\n---" + assert output["tokens_used"] == 3 diff --git a/scripts/tests/test_eval_compare_optimization.py b/scripts/tests/test_eval_compare_optimization.py new file mode 100644 index 0000000..f687c7a --- /dev/null +++ b/scripts/tests/test_eval_compare_optimization.py @@ -0,0 +1,33 @@ +import importlib.util +import json +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def load_module(name: str, relative_path: str): + spec = importlib.util.spec_from_file_location(name, REPO_ROOT / relative_path) + 
module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +def test_load_optimization_data_ignores_unrelated_results_json(tmp_path): + eval_compare = load_module( + "skill_creator_eval_compare", + "skills/skill-creator/scripts/eval_compare.py", + ) + (tmp_path / "results.json").write_text(json.dumps({"status": "not-optimization"})) + (tmp_path / "evals" / "iterations").mkdir(parents=True) + expected = { + "target": "skills/example/SKILL.md", + "baseline_score": {"train": 1.0, "test": 1.0}, + "iterations": [], + } + (tmp_path / "evals" / "iterations" / "results.json").write_text(json.dumps(expected)) + + loaded = eval_compare.load_optimization_data(tmp_path) + + assert loaded == expected diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md index 2fed9c1..60a155d 100644 --- a/skills/agent-comparison/SKILL.md +++ b/skills/agent-comparison/SKILL.md @@ -2,12 +2,15 @@ name: agent-comparison description: | A/B test agent variants measuring quality and total session token cost - across simple and complex benchmarks. Use when creating compact agent - versions, validating agent changes, comparing internal vs external agents, - or deciding between variants for production. Use for "compare agents", - "A/B test", "benchmark agents", or "test agent efficiency". Route single-agent evaluation to agent-evaluation, testing skills, or optimizing prompts - without variant comparison. -version: 2.0.0 + across simple and complex benchmarks. Also supports automated optimization + loops (autoresearch) for frontmatter description and routing-trigger quality + using train/test eval sets. Use when creating compact agent versions, validating + agent changes, comparing internal vs external agents, optimizing a skill description, + or deciding between variants for production. 
Use for "compare agents", "A/B test", + "benchmark agents", "test agent efficiency", "optimize description", "optimize skill", + or "run autoresearch". Route single-agent evaluation to agent-evaluation, testing skills, + or simple prompt optimization without variant comparison. +version: 2.2.0 user-invocable: false allowed-tools: - Read @@ -22,6 +25,9 @@ routing: - "compare agents" - "A/B test agents" - "benchmark agents" + - "optimize skill" + - "optimize description" + - "run autoresearch" category: meta-tooling --- @@ -254,6 +260,129 @@ Remove temporary benchmark files and debug outputs. Keep only the comparison rep **Gate**: Report generated with all metrics. Verdict stated with evidence. Report saved to benchmark directory. +### Phase 5: OPTIMIZE (optional — invoked explicitly) + +**Goal**: Run an automated optimization loop that iteratively improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then keeps only measured improvements. + +This phase is for routing/trigger optimization, not full code-generation benchmarking. Invoke it when the user says "optimize this skill", "optimize the description", or "run autoresearch". The existing manual A/B comparison (Phases 1-4) remains the path for full agent benchmarking. 
+ +**Step 1: Validate optimization target and goal** + +Confirm the target file exists, has YAML frontmatter with a `description`, and the optimization goal is clear: + +```bash +# Target must be a markdown file with frontmatter description +test -f skills/{target}/SKILL.md +rg -n '^description:' skills/{target}/SKILL.md + +# Goal should be specific and measurable +# Good: "improve error handling instructions" +# Bad: "make it better" +``` + +**Step 2: Prepare trigger-rate eval tasks** + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/{target}/SKILL.md \ + --goal "{optimization goal}" \ + --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ + --train-split 0.6 \ + --model claude-sonnet-4-20250514 \ + --verbose +``` + +Supported task schemas: +- Flat `tasks` list with optional `"split": "train" | "test"` per task +- Top-level `train` and `test` arrays + +Every task must include: +- `query`: the routing prompt to test +- `should_trigger`: whether the target should trigger for that prompt + +If no split markers are present, the loop does a reproducible random split with seed `42`. + +**Step 3: Run baseline evaluation** + +The loop automatically evaluates the unmodified target against the train set before starting iteration. This establishes the score to beat, and records a held-out baseline if test tasks exist. 
+ +**Step 4: Enter optimization loop** + +The `optimize_loop.py` script handles the full loop: +- Calls `generate_variant.py` to propose changes (Claude with extended thinking) +- Evaluates each variant against train tasks +- Keeps variants that improve score by more than `--min-gain` (default 0.02) +- Reverts variants that don't improve, break hard gates, or delete sections without justification +- Checks held-out test set every 5 iterations for Goodhart divergence +- Stops on convergence (5 consecutive reverts), Goodhart alarm, or max iterations + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/{target}/SKILL.md \ + --goal "{optimization goal}" \ + --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ + --max-iterations 20 \ + --min-gain 0.02 \ + --train-split 0.6 \ + --model claude-sonnet-4-20250514 \ + --report optimization-report.html \ + --output-dir evals/iterations \ + --verbose +``` + +The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and review/export controls. + +**Step 5: Present results in UI** + +Open the generated `optimization-report.html` in a browser. The report shows: +- Progress dashboard (status, baseline vs best, kept/reverted counts) +- Convergence chart (train solid line, held-out dashed line, baseline dotted) +- Iteration table with verdict, composite score, delta, and change summary +- Expandable inline diffs per iteration (click any row) + +**Step 6: Review kept snapshots** + +Not all KEEP iterations are real improvements — some may be harness artifacts. 
The user reviews the kept iterations as candidate snapshots from the original target: +- Inspect each kept iteration's diff in the report +- Use "Preview Selected Snapshot" only as a comparison aid in the UI +- Use "Export Selected" to download a review JSON describing the selected snapshot diffs + +**Step 7: Apply selected improvements to target file** + +Apply one reviewed improvement to the original target file. + +- If you want the best single kept variant, use `evals/iterations/best_variant.md`. +- If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple kept diffs into a generated patch. + +```bash +# Review the best kept variant before applying +cat evals/iterations/best_variant.md | head -20 + +# Replace the target with the best kept variant +cp evals/iterations/best_variant.md skills/{target}/SKILL.md +``` + +**Step 8: Run final evaluation on FULL task set (train + test)** + +After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize: + +```bash +# Re-run optimize_loop.py against the same task file and inspect results.json/report output +``` + +Compare final scores to the baseline to confirm net improvement. + +**Step 9: Record in learning-db** + +```bash +python3 scripts/learning-db.py learn \ + --skill agent-comparison \ + "autoresearch: {target} improved {baseline}→{best} over {iterations} iterations. \ + Kept: {kept}/{total}. Stop: {reason}. Changes: {summaries}" +``` + +**Gate**: Optimization complete. Results reviewed. Cherry-picked improvements applied and verified against full task set. Results recorded. + ### Optional Extensions These are off by default. 
Enable explicitly when needed: diff --git a/skills/agent-comparison/references/benchmark-tasks.md b/skills/agent-comparison/references/benchmark-tasks.md index c3f0d4b..f6142d1 100644 --- a/skills/agent-comparison/references/benchmark-tasks.md +++ b/skills/agent-comparison/references/benchmark-tasks.md @@ -184,3 +184,76 @@ cd benchmark/{task-name}/compact && go test -race -v # Manual alternative: compare outputs side-by-side using diff diff benchmark/{task-name}/full/ benchmark/{task-name}/compact/ ``` + +## Optimization Loop Task Format + +The current optimization loop is for frontmatter-description and routing-trigger quality. It does not run full code-generation benchmarks. Use Phase 5 with trigger-rate eval tasks, then use Phases 1-4 for full agent benchmarking. + +### Supported Task File Schemas + +Flat list with optional split markers: + +```json +{ + "tasks": [ + { + "name": "go-testing-positive", + "split": "train", + "complexity": "complex", + "query": "write table-driven tests for a Go parser with subtests and helpers", + "should_trigger": true + }, + { + "name": "kubernetes-negative", + "split": "test", + "complexity": "complex", + "query": "debug a kubernetes pod stuck in CrashLoopBackOff", + "should_trigger": false + } + ] +} +``` + +Explicit top-level train/test sets: + +```json +{ + "train": [ + { + "name": "positive-1", + "query": "write Go benchmarks and race tests for a worker pool", + "should_trigger": true + } + ], + "test": [ + { + "name": "negative-1", + "query": "design a PostgreSQL indexing strategy", + "should_trigger": false + } + ] +} +``` + +### Required Fields + +- `query`: the prompt used to test routing behavior +- `should_trigger`: expected boolean outcome for the target description + +### Optional Fields + +- `name`: human-readable label shown in reports +- `split`: `train` or `test` when using a flat `tasks` list +- `complexity`: used for stratified splitting when no explicit split is provided + +### Split Strategy + +- `train` 
tasks are used during each optimization iteration. +- `test` tasks are held out and checked every 5 iterations for Goodhart divergence. +- If no split markers are present, the loop performs a reproducible random split with seed `42`, stratified by `complexity`. + +### Task Selection Principles for Optimization + +1. Cover both positive and negative routing examples. A description that only improves recall while tanking precision is not an improvement. +2. Put at least one out-of-domain prompt in the held-out set. This catches overfitting where the description starts matching benchmark phrasing instead of the real scope. +3. Use realistic user wording, not only canonical trigger phrases. Optimization on synthetic wording alone produces brittle routing behavior. diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md new file mode 100644 index 0000000..3b74e16 --- /dev/null +++ b/skills/agent-comparison/references/optimization-guide.md @@ -0,0 +1,181 @@ +# Autoresearch Optimization Guide + +## Scope + +The current autoresearch loop optimizes a markdown target's frontmatter +`description` using trigger-rate eval tasks. This is useful for improving +skill routing accuracy and similar description-driven dispatch behavior. + +It is not a replacement for the manual agent benchmark workflow in Phases 1-4. +If you want to compare real code-generation quality across benchmark tasks, use +the normal A/B process. + +## Supported Targets + +- `skills//SKILL.md` +- Other markdown targets with valid YAML frontmatter and a non-empty + `description` + +The loop rejects targets without parseable frontmatter or without a +`description`, because trigger-rate evaluation depends on the target text that +drives routing. 
+ +## Supported Task Formats + +Every task must include: + +- `query`: the prompt to test +- `should_trigger`: whether the target should trigger for that prompt + +Optional fields: + +- `name`: label shown in logs and reports +- `split`: `train` or `test` +- `complexity`: used for stratified splitting when `split` is omitted + +Flat task list: + +```json +{ + "tasks": [ + { + "name": "positive-1", + "split": "train", + "complexity": "complex", + "query": "write table-driven Go tests with subtests and helper functions", + "should_trigger": true + }, + { + "name": "negative-1", + "split": "test", + "complexity": "complex", + "query": "debug a Kubernetes pod stuck in CrashLoopBackOff", + "should_trigger": false + } + ] +} +``` + +Explicit train/test sets: + +```json +{ + "train": [ + { + "name": "positive-1", + "query": "write race-safe Go tests for a worker pool", + "should_trigger": true + } + ], + "test": [ + { + "name": "negative-1", + "query": "optimize a PostgreSQL indexing strategy", + "should_trigger": false + } + ] +} +``` + +If no split markers are present, the loop performs a reproducible random split +using `--train-split` and seed `42`. + +## Command + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/go-testing/SKILL.md \ + --goal "improve routing precision without losing recall" \ + --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ + --train-split 0.6 \ + --max-iterations 20 \ + --min-gain 0.02 \ + --model claude-sonnet-4-20250514 \ + --report optimization-report.html \ + --output-dir evals/iterations \ + --verbose +``` + +Useful flags: + +- `--dry-run`: exercise the loop mechanics without API calls +- `--report`: write a live HTML report +- `--output-dir`: persist iteration snapshots and `results.json` + +## Evaluation Model + +The loop follows the ADR-131 structure: + +1. Hard gates +2. Weighted composite score +3. 
Held-out regression checks + +### Layer 1: Hard Gates + +An iteration is rejected immediately if any of these fail: + +- `parses` +- `compiles` +- `tests_pass` +- `protected_intact` + +For description optimization, `parses` and `protected_intact` are the most +important gates. Protected sections fenced by `DO NOT OPTIMIZE` markers must be +preserved verbatim. + +### Layer 2: Composite Score + +The loop converts trigger-rate evaluation results into a weighted composite +score using the built-in weights in `optimize_loop.py`. A variant is kept only +if it beats the previous best by more than `--min-gain`. + +### Layer 3: Held-Out Regression Check + +Every 5 iterations, the current best variant is scored on the held-out test set. +If held-out performance drops below the baseline while train performance has +improved, the loop raises a Goodhart alarm and stops. + +## Deletion Safety Rule + +Deleting sections is allowed only with explicit justification. + +- `generate_variant.py` detects removed `##` headings +- the model must return a `deletion_justification` +- `optimize_loop.py` rejects deletions without one + +This enforces ADR-131's "no deletion without justification" rule. + +## Iteration Artifacts + +When `--output-dir` is set, the loop writes: + +- `001/variant.md` +- `001/scores.json` +- `001/verdict.json` +- `001/diff.patch` +- `best_variant.md` +- `results.json` + +When `--report` is set, it also writes a live HTML dashboard showing: + +- status, baseline, best score, kept/reverted counts +- convergence chart +- iteration table with diffs +- review/export controls for kept snapshot diffs from the original target + +## Choosing Good Eval Tasks + +1. Include both positive and negative prompts. +2. Put realistic user phrasing in both train and held-out sets. +3. Keep at least one out-of-domain negative example in held-out. +4. Do not let the eval set collapse into benchmark keywords only. 
+ +## Limitations + +Current limitations are intentional and documented: + +- The loop does not execute full code-generation benchmarks. +- Pattern-based benchmark tasks with `prompt`, `expected_patterns`, and + `forbidden_patterns` are not supported by `optimize_loop.py`. +- For full agent quality comparisons, continue to use the manual benchmark and + grading flow in Phases 1-4. diff --git a/skills/agent-comparison/references/optimization-tasks.example.json b/skills/agent-comparison/references/optimization-tasks.example.json new file mode 100644 index 0000000..098226e --- /dev/null +++ b/skills/agent-comparison/references/optimization-tasks.example.json @@ -0,0 +1,32 @@ +{ + "tasks": [ + { + "name": "positive-go-tests", + "split": "train", + "complexity": "complex", + "query": "write table-driven Go tests with subtests and helper functions", + "should_trigger": true + }, + { + "name": "positive-benchmarks", + "split": "train", + "complexity": "simple", + "query": "add Go benchmarks and race-safe test coverage for a worker pool", + "should_trigger": true + }, + { + "name": "negative-kubernetes", + "split": "test", + "complexity": "complex", + "query": "debug a kubernetes pod stuck in CrashLoopBackOff", + "should_trigger": false + }, + { + "name": "negative-sql", + "split": "test", + "complexity": "simple", + "query": "design a PostgreSQL indexing strategy for a reporting query", + "should_trigger": false + } + ] +} diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py new file mode 100644 index 0000000..2378e50 --- /dev/null +++ b/skills/agent-comparison/scripts/generate_variant.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +"""Generate a variant of an agent/skill file using Claude with extended thinking. + +Proposes modifications to improve the target file based on the optimization +goal and previous iteration failures. Preserves protected sections marked +with DO NOT OPTIMIZE markers. 
+ +Pattern: follows improve_description.py's Claude + extended thinking approach. + +Usage: + python3 skills/agent-comparison/scripts/generate_variant.py \ + --target agents/golang-general-engineer.md \ + --goal "improve error handling instructions" \ + --current-content "..." \ + --failures '[...]' \ + --model claude-sonnet-4-20250514 + +Output (JSON to stdout): + { + "variant": "full file content...", + "summary": "Added CRITICAL warning for error wrapping", + "deletion_justification": "", + "reasoning": "Extended thinking content...", + "tokens_used": 12345 + } + +See ADR-131 for safety rules. +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys + +try: + import anthropic +except ImportError: # pragma: no cover - exercised in environments without the SDK + anthropic = None + +# --------------------------------------------------------------------------- +# Protected section handling +# --------------------------------------------------------------------------- + +_PROTECTED_RE = re.compile( + r"(.*?)", + re.DOTALL, +) + + +def extract_protected(content: str) -> list[str]: + """Extract all protected sections from content.""" + return _PROTECTED_RE.findall(content) + + +def restore_protected(original: str, variant: str) -> str: + """Restore protected sections from original into variant.""" + orig_sections = extract_protected(original) + var_sections = extract_protected(variant) + + if len(orig_sections) != len(var_sections): + print( + "Warning: Protected section count mismatch " + f"(original={len(orig_sections)}, variant={len(var_sections)}).", + file=sys.stderr, + ) + return variant + + result = variant + for orig_sec, var_sec in zip(orig_sections, var_sections): + result = result.replace(var_sec, orig_sec, 1) + + return result + + +# --------------------------------------------------------------------------- +# Deletion detection +# --------------------------------------------------------------------------- + + +def 
detect_deletions(original: str, variant: str) -> list[str]: + """Find sections that exist in original but are missing from variant. + + Returns list of deleted section headings. Only checks ## headings. + """ + orig_headings = set(re.findall(r"^##\s+(.+)$", original, re.MULTILINE)) + var_headings = set(re.findall(r"^##\s+(.+)$", variant, re.MULTILINE)) + return sorted(orig_headings - var_headings) + + +# --------------------------------------------------------------------------- +# Variant generation +# --------------------------------------------------------------------------- + + +def generate_variant( + client: anthropic.Anthropic, + target_path: str, + goal: str, + current_content: str, + failures: list[dict], + model: str, + history: list[dict] | None = None, +) -> dict: + """Call Claude to generate a variant of the target file. + + Returns dict with variant content, summary, reasoning, and token count. + """ + # Build the prompt + failure_section = "" + if failures: + failure_section = "\n\nFailed tasks from the last iteration:\n" + for f in failures: + failure_section += f" - {f.get('name', 'unnamed')}: {f.get('details', 'failed')}\n" + + history_section = "" + if history: + history_section = "\n\nPrevious attempts (do NOT repeat — try structurally different approaches):\n" + for h in history: + history_section += f" Iteration {h.get('number', '?')}: {h.get('verdict', '?')} — {h.get('change_summary', '')}\n" + + protected_sections = extract_protected(current_content) + protected_notice = "" + if protected_sections: + protected_notice = f""" + +CRITICAL SAFETY RULE: The file contains {len(protected_sections)} protected section(s) marked with + and markers. +You MUST preserve these sections EXACTLY as they are — character for character. +Do not add, remove, or modify anything between these markers. 
+This is non-negotiable: protected sections contain safety gates that must not be +removed even if removing them would improve test scores.""" + + prompt = f"""You are optimizing an agent/skill file to improve its performance. + +Target file: {target_path} +Optimization goal: {goal} + +Current content of the file: + +{current_content} + +{failure_section}{history_section}{protected_notice} + +SAFETY RULES: +1. Do NOT delete sections without replacing them with equivalent or better content. + If you remove a section heading that exists in the original, you must explain what + replaces the removed functionality. Pure deletion degrades unmeasured capabilities. + +2. Do NOT change the tools, SDKs, or interfaces the agent uses. The variant must work + in the same environment as the original (no switching from SDK to curl, etc.). + +3. Keep YAML frontmatter structure intact (name, description, routing, etc.). + +4. Focus on making the agent/skill better at achieving the stated goal. Common + improvements include: + - Moving critical information to more prominent positions (CRITICAL banners) + - Adding explicit planning steps before code generation + - Improving error handling instructions with specific patterns + - Adding concrete examples for ambiguous instructions + - Restructuring for clarity when sections are dense + +Please respond with the complete modified file content inside tags, +and a brief summary of what you changed and why inside tags. + +If you removed any existing `##` section heading, include a brief justification +inside tags. If you did not remove a section, return +empty tags. 
+ + +[complete file content here] + + + +[1-2 sentence description of the change] + + + +[why any removed section was replaced safely, or leave blank] +""" + + try: + response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[{"role": "user", "content": prompt}], + ) + except anthropic.APIStatusError as e: + print(f"Error: API returned status {e.status_code}: {e.message}", file=sys.stderr) + sys.exit(1) + except anthropic.APIConnectionError as e: + print(f"Error: API connection failed: {e}", file=sys.stderr) + sys.exit(1) + + # Extract thinking and text + thinking_text = "" + text = "" + for block in response.content: + if block.type == "thinking": + thinking_text = block.thinking + elif block.type == "text": + text = block.text + + # Parse variant content + variant_match = re.search(r"(.*?)", text, re.DOTALL) + if not variant_match: + print("Error: No tags in response", file=sys.stderr) + sys.exit(1) + + variant = variant_match.group(1).strip() + + # Parse summary + summary_match = re.search(r"(.*?)", text, re.DOTALL) + summary = summary_match.group(1).strip() if summary_match else "No summary provided" + + deletion_match = re.search(r"(.*?)", text, re.DOTALL) + deletion_justification = deletion_match.group(1).strip() if deletion_match else "" + + # Restore protected sections (safety net) + variant = restore_protected(current_content, variant) + + # Check for unauthorized deletions + deletions = detect_deletions(current_content, variant) + if deletions: + print(f"Warning: Deleted sections: {deletions}", file=sys.stderr) + + tokens_used = response.usage.input_tokens + response.usage.output_tokens + + return { + "variant": variant, + "summary": summary, + "deletion_justification": deletion_justification, + "reasoning": thinking_text, + "tokens_used": tokens_used, + "deletions": deletions, + } + + +# --------------------------------------------------------------------------- +# 
def main():
    """CLI wrapper: parse arguments, load the current content, generate one
    variant via the Anthropic API, and print the result JSON to stdout.

    Exits non-zero on invalid JSON arguments or a missing anthropic SDK.
    """
    parser = argparse.ArgumentParser(description="Generate agent/skill variant using Claude")
    parser.add_argument("--target", required=True, help="Path to target file (for context)")
    parser.add_argument("--goal", required=True, help="Optimization goal")
    content_group = parser.add_mutually_exclusive_group(required=True)
    content_group.add_argument("--current-content", help="Current file content")
    content_group.add_argument("--current-content-file", help="Path to a file containing the current content")
    parser.add_argument("--failures", default="[]", help="JSON list of failed tasks")
    parser.add_argument("--history", default="[]", help="JSON list of previous iterations")
    parser.add_argument("--model", default="claude-sonnet-4-20250514", help="Model to use")
    args = parser.parse_args()

    # Validate JSON arguments up front so a malformed flag fails fast with a
    # targeted message instead of a traceback mid-call.
    try:
        failures = json.loads(args.failures)
    except json.JSONDecodeError as e:
        print(f"Error: --failures is not valid JSON: {e}", file=sys.stderr)
        sys.exit(1)
    try:
        history = json.loads(args.history)
    except json.JSONDecodeError as e:
        print(f"Error: --history is not valid JSON: {e}", file=sys.stderr)
        sys.exit(1)

    if anthropic is None:
        print("Error: anthropic SDK is not installed", file=sys.stderr)
        sys.exit(1)

    if args.current_content_file:
        # Context manager closes the handle promptly; the previous bare
        # open(...).read() leaked the file object until garbage collection.
        with open(args.current_content_file, encoding="utf-8") as content_file:
            current_content = content_file.read()
    else:
        current_content = args.current_content

    client = anthropic.Anthropic()
    result = generate_variant(
        client=client,
        target_path=args.target,
        goal=args.goal,
        current_content=current_content,
        failures=failures,
        model=args.model,
        history=history if history else None,
    )

    print(json.dumps(result, indent=2))
# Weights for the Layer-2 quality dimensions; they sum to 1.0 so the
# composite stays on the same scale as the per-dimension scores.
QUALITY_WEIGHTS = {
    "correctness": 0.40,
    "error_handling": 0.20,
    "language_idioms": 0.15,
    "testing": 0.15,
    "efficiency": 0.10,
}

HARD_GATE_KEYS = ["parses", "compiles", "tests_pass", "protected_intact"]


def passes_hard_gates(scores: dict) -> bool:
    """Layer 1: every hard-gate key must be truthy, else the variant fails."""
    for gate in HARD_GATE_KEYS:
        if not scores.get(gate, False):
            return False
    return True


def composite_score(scores: dict) -> float:
    """Layer 2: weighted quality score; forced to 0.0 when any hard gate fails.

    Missing quality dimensions contribute 0. Result is rounded to 4 places.
    """
    if not passes_hard_gates(scores):
        return 0.0
    weighted = sum(scores.get(dim, 0.0) * weight for dim, weight in QUALITY_WEIGHTS.items())
    return round(weighted, 4)


def holdout_diverges(
    train_score: float,
    holdout_score: float,
    baseline_holdout: float,
    baseline_train: float = 0.0,
    threshold: float = 0.5,
) -> bool:
    """Goodhart alarm: the held-out score dropped (by more than *threshold*
    below its baseline) while the train score improved over its baseline."""
    dropped = (baseline_holdout - holdout_score) > threshold
    improved = train_score > baseline_train
    return dropped and improved
make_dry_run_variant(current_content: str, iteration: int) -> tuple[str, str, str]: + """Generate a deterministic local variant for --dry-run mode.""" + marker = f"" + if marker in current_content: + marker = f"" + if current_content.endswith("\n"): + variant = current_content + marker + "\n" + else: + variant = current_content + "\n" + marker + "\n" + return variant, "Synthetic dry-run mutation", "dry-run synthetic variant" + + +# --------------------------------------------------------------------------- +# HTML report generation +# --------------------------------------------------------------------------- + + +def _build_report_data( + target: str, + goal: str, + baseline_composite: float, + baseline_holdout: float | None, + train_size: int, + test_size: int, + iterations: list[dict], + max_iterations: int, + status: str, + total_tokens: int, +) -> dict: + """Build the data structure for HTML report generation.""" + return { + "target": target, + "goal": goal, + "status": status, + "baseline_score": {"train": baseline_composite, "test": baseline_holdout}, + "task_counts": {"train": train_size, "test": test_size}, + "max_iterations": max_iterations, + "total_tokens": total_tokens, + "iterations": iterations, + } + + +def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: + """Generate iteration history HTML report. + + The convergence chart is built client-side using safe DOM methods + (createElementNS, setAttribute, textContent) — no innerHTML. + All string data is escaped server-side via html.escape before + embedding in the template. 
+ """ + import html as html_mod + + target = html_mod.escape(data.get("target", "")) + goal = html_mod.escape(data.get("goal", "")) + status = data.get("status", "RUNNING") + iterations = data.get("iterations", []) + baseline = data.get("baseline_score", {}) + task_counts = data.get("task_counts", {}) + + refresh = '' if auto_refresh else "" + + rows = "" + for it in iterations: + v = it["verdict"] + vcls = {"KEEP": "keep", "REVERT": "revert", "STOP": "stop"}.get(v, "") + sc = it["score"] + train_score = sc.get("train") + test_score = sc.get("test") + score_str = f'{train_score:.2f}' if isinstance(train_score, (int, float)) else "?" + if isinstance(test_score, (int, float)): + score_str += f' / {test_score:.2f}' + delta = str(it.get("delta", "")) + dcls = "d-pos" if delta.startswith("+") and delta != "+0" else "d-neg" if delta.startswith("-") else "d-zero" + summary = html_mod.escape(str(it.get("change_summary", ""))[:80]) + diff_esc = html_mod.escape(str(it.get("diff", ""))) + is_keep = v == "KEEP" + n = it["number"] + + rows += f""" + + {n} + {v} + {score_str} + {delta} + {summary} + + + +
{diff_esc}
+ """ + + chart_json = json.dumps([ + {"x": it["number"], "train": it["score"].get("train", 0), "test": it["score"].get("test")} + for it in iterations + ]) + diffs_json = json.dumps({it["number"]: str(it.get("diff", "")) for it in iterations}) + + bt = baseline.get("train", 0.0) + best = max((it["score"].get("train", bt) for it in iterations), default=bt) + kept = sum(1 for it in iterations if it["verdict"] == "KEEP") + reverted = sum(1 for it in iterations if it["verdict"] == "REVERT") + cur = len(iterations) + mx = data.get("max_iterations", 20) + scls = "running" if status == "RUNNING" else "done" if status in ("CONVERGED", "COMPLETE") else "alarm" + score_label = f"Train tasks: {task_counts.get('train', 0)}" + if task_counts.get("test"): + score_label += f" | Held-out tasks: {task_counts['test']}" + + return f""" + + +{refresh} +Optimization: {target} + + + +

Optimization: {target}

+

Goal: {goal}

+
+
Status{status}
+
Progress{cur}/{mx}
+
Baseline{bt:.2f}
+
Best{best:.2f} ({best - bt:+.2f})
+
Kept{kept}
+
Reverted{reverted}
+
+

{score_label}

+
+ + +{rows} +
#VerdictScoreDeltaChangePick
+
+ + +
def load_benchmark_tasks(path: Path) -> list[dict]:
    """Load benchmark tasks from a JSON file.

    Accepted top-level shapes:
      - a bare list of task dicts
      - {"tasks": [...]}
      - {"train": [...], "test": [...]} — each task is tagged with a
        'split' field naming the list it came from (existing 'split'
        values are preserved).

    Raises ValueError for any other top-level JSON shape.
    """
    data = json.loads(path.read_text())
    if isinstance(data, list):
        return data
    # A bare string/number previously fell into the `"tasks" in data`
    # substring test and died with a confusing TypeError; reject non-dict
    # payloads explicitly with the documented error instead.
    if not isinstance(data, dict):
        raise ValueError("Task file must be a list, {'tasks': [...]}, or {'train': [...], 'test': [...]}.")
    if "tasks" in data:
        return data["tasks"]
    if "train" in data or "test" in data:
        tasks = []
        for split_name in ("train", "test"):
            for task in data.get(split_name, []):
                normalized = dict(task)
                normalized.setdefault("split", split_name)
                tasks.append(normalized)
        return tasks
    raise ValueError("Task file must be a list, {'tasks': [...]}, or {'train': [...], 'test': [...]}.")
+ """ + has_explicit = any("split" in t for t in tasks) + if has_explicit: + train = [t for t in tasks if t.get("split", "train") == "train"] + test = [t for t in tasks if t.get("split") == "test"] + return train, test + + rng = random.Random(seed) + by_complexity: dict[str, list[dict]] = {} + for t in tasks: + by_complexity.setdefault(t.get("complexity", "medium"), []).append(t) + + train, test = [], [] + for group in by_complexity.values(): + rng.shuffle(group) + n_train = max(1, int(len(group) * train_split)) + train.extend(group[:n_train]) + test.extend(group[n_train:]) + + return train, test + + +# --------------------------------------------------------------------------- +# Frontmatter parsing +# --------------------------------------------------------------------------- + + +def _parse_frontmatter(content: str) -> tuple[bool, str]: + """Parse YAML frontmatter, returning (valid, description).""" + if not content.startswith("---"): + return False, "" + lines = content.split("\n") + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + return False, "" + + description = "" + fm_lines = lines[1:end_idx] + idx = 0 + while idx < len(fm_lines): + line = fm_lines[idx] + if line.startswith("description:"): + value = line[len("description:"):].strip() + if value in (">", "|", ">-", "|-"): + parts: list[str] = [] + idx += 1 + while idx < len(fm_lines) and (fm_lines[idx].startswith(" ") or fm_lines[idx].startswith("\t")): + parts.append(fm_lines[idx].strip()) + idx += 1 + description = " ".join(parts) + continue + else: + description = value.strip('"').strip("'") + idx += 1 + return True, description + + +def _is_trigger_task(task: dict) -> bool: + return "query" in task and "should_trigger" in task + + +def _is_pattern_task(task: dict) -> bool: + return "prompt" in task and ( + "expected_patterns" in task or "forbidden_patterns" in task or "weight" in task + ) + + +def 
def _run_trigger_rate(
    target_path: Path,
    description: str,
    tasks: list[dict],
    num_workers: int = 5,
    timeout: int = 30,
    verbose: bool = False,
) -> dict:
    """Run trigger-rate assessment using the skill_eval infrastructure.

    Tasks must have 'query' and 'should_trigger' fields.
    Returns a run_eval-style results dict. On any eval failure — non-zero
    exit, subprocess timeout, or unparseable stdout — returns an empty
    summary so the optimization loop degrades to a zero score instead of
    crashing mid-iteration (previously TimeoutExpired/JSONDecodeError
    propagated out of assess_target and aborted the whole run).
    """
    import os

    empty = {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}}

    task_file = None
    try:
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(tasks, f)
            task_file = f.name

        with tempfile.TemporaryDirectory() as skill_dir:
            skill_md = Path(skill_dir) / "SKILL.md"
            skill_md.write_text(target_path.read_text())

            # Walk up to the repository root (marked by .claude/) so the
            # scripts.skill_eval module resolves regardless of cwd.
            project_root = Path.cwd()
            for parent in [project_root, *project_root.parents]:
                if (parent / ".claude").is_dir():
                    project_root = parent
                    break

            cmd = [
                sys.executable, "-m", "scripts.skill_eval.run_eval",
                "--eval-set", task_file,
                "--skill-path", skill_dir,
                "--description", description,
                "--num-workers", str(num_workers),
                "--timeout", str(timeout),
                "--runs-per-query", "1",
            ]
            if verbose:
                cmd.append("--verbose")
            print(f"Running trigger assessment: {len(tasks)} queries", file=sys.stderr)

            # Strip CLAUDECODE so the child eval does not believe it is
            # nested inside an existing session.
            env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}

            try:
                result = subprocess.run(
                    cmd, capture_output=True, text=True,
                    cwd=str(project_root), env=env, timeout=600,
                )
            except subprocess.TimeoutExpired:
                if verbose:
                    print("Trigger assessment timed out after 600s", file=sys.stderr)
                return empty

            if result.returncode != 0:
                if verbose:
                    print(f"Trigger assessment failed: {result.stderr[:300]}", file=sys.stderr)
                return empty

            try:
                return json.loads(result.stdout)
            except json.JSONDecodeError as e:
                if verbose:
                    print(f"Trigger assessment output unparseable: {e}", file=sys.stderr)
                return empty
    finally:
        if task_file:
            Path(task_file).unlink(missing_ok=True)
+ + Returns scores dict with hard gate booleans and quality dimensions. + """ + scores: dict = { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": 0.0, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [], + } + + content = target_path.read_text() + valid, description = _parse_frontmatter(content) + if not valid or not description: + scores["parses"] = False + return scores + + # Dry-run mode: content-dependent synthetic scores for testing loop mechanics. + # Hard gates always pass (the point is testing keep/revert logic). + # Quality scores vary deterministically based on content hash so that + # different variants produce different scores. + if dry_run: + import hashlib + h = int(hashlib.sha256(content.encode()).hexdigest()[:8], 16) + base = (h % 30 + 70) / 100.0 # 0.70-1.00 range — always decent + scores["correctness"] = round(base * 10, 2) + scores["error_handling"] = round(base * 8, 2) + scores["language_idioms"] = round(base * 7, 2) + scores["testing"] = round(base * 7, 2) + scores["efficiency"] = round(base * 6, 2) + scores["tests_pass"] = True # always pass in dry-run + for task in tasks: + name = task.get("name", task.get("query", "unnamed"))[:40] + scores["task_results"].append({ + "name": name, "passed": True, + "score": base, "details": "dry-run", + }) + return scores + + # Detect assessment mode from task format + is_trigger = all(_is_trigger_task(task) for task in tasks) + + if is_trigger: + results = _run_trigger_rate(target_path, description, tasks, verbose=verbose) + summary = results.get("summary", {}) + total = summary.get("total", 0) + passed = summary.get("passed", 0) + if total == 0: + return scores + + accuracy = passed / total + scores["correctness"] = round(accuracy * 10, 2) + scores["error_handling"] = round(accuracy * 8, 2) + scores["language_idioms"] = round(accuracy * 7, 2) + scores["testing"] = round(accuracy * 8, 2) + 
scores["efficiency"] = round(min(1.0, accuracy + 0.1) * 6, 2) + scores["tests_pass"] = passed == total + + for r in results.get("results", []): + scores["task_results"].append({ + "name": r.get("query", "unnamed")[:40], + "passed": r.get("pass", False), + "score": 1.0 if r.get("pass", False) else 0.0, + "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}", + }) + return scores + + # Benchmark behavioral assessment — not yet implemented. + # Use trigger-rate format (tasks with 'query' + 'should_trigger') + # as the recommended starting point per ADR-131 research findings. + raise NotImplementedError( + "Pattern benchmark tasks are not yet implemented. " + "Use trigger-rate tasks with 'query' and 'should_trigger' fields. " + "See optimization-guide.md." + ) + + +# --------------------------------------------------------------------------- +# Protected section validation +# --------------------------------------------------------------------------- + +_PROTECTED_RE = re.compile( + r"(.*?)", + re.DOTALL, +) + + +def check_protected_sections(original: str, variant: str) -> bool: + """Verify DO NOT OPTIMIZE sections are preserved verbatim.""" + orig = list(_PROTECTED_RE.finditer(original)) + var = list(_PROTECTED_RE.finditer(variant)) + if len(orig) != len(var): + return False + return all(orig_match.group(0) == var_match.group(0) for orig_match, var_match in zip(orig, var)) + + +# --------------------------------------------------------------------------- +# Main loop +# --------------------------------------------------------------------------- + + +def run_optimization_loop( + target_path: Path, + goal: str, + benchmark_tasks_path: Path, + max_iterations: int = 20, + min_gain: float = 0.02, + train_split: float = 0.6, + model: str = "claude-sonnet-4-20250514", + verbose: bool = False, + report_path: Path | None = None, + output_dir: Path | None = None, + dry_run: bool = False, +) -> dict: + """Run the autoresearch optimization loop.""" + if output_dir is None: + 
output_dir = Path("evals/iterations") + output_dir.mkdir(parents=True, exist_ok=True) + + all_tasks = load_benchmark_tasks(benchmark_tasks_path) + _validate_task_set(all_tasks) + train_tasks, test_tasks = split_tasks(all_tasks, train_split) + + if verbose: + print(f"Tasks: {len(train_tasks)} train, {len(test_tasks)} test", file=sys.stderr) + + original_content = target_path.read_text() + target_valid, target_description = _parse_frontmatter(original_content) + if not target_valid or not target_description: + raise ValueError( + "Target must have YAML frontmatter with a non-empty description. " + "optimize_loop.py currently supports frontmatter-description optimization only." + ) + current_content = original_content + target_label = target_path.name + + if verbose: + print("Running baseline evaluation...", file=sys.stderr) + + baseline_scores = assess_target(target_path, train_tasks, goal, verbose, dry_run) + baseline_composite = composite_score(baseline_scores) + best_score = baseline_composite + best_content = current_content + best_iteration = 0 + + baseline_holdout_scores = assess_target(target_path, test_tasks, goal, verbose, dry_run) if test_tasks else None + baseline_holdout = composite_score(baseline_holdout_scores) if baseline_holdout_scores else None + + if verbose: + holdout_display = f"{baseline_holdout:.4f}" if baseline_holdout is not None else "n/a" + print(f"Baseline: train={baseline_composite:.4f}, holdout={holdout_display}", file=sys.stderr) + + iterations: list[dict] = [] + consecutive_reverts = 0 + exit_reason = "unknown" + status = "RUNNING" + total_tokens = 0 + + for i in range(1, max_iterations + 1): + if verbose: + print(f"\n{'=' * 60}\nIteration {i}/{max_iterations} (best={best_score:.4f})", file=sys.stderr) + + # 1. 
Generate variant + t0 = time.time() + last_failures = [] + if iterations: + last_scores_data = iterations[-1].get("scores", {}) + last_failures = [t for t in last_scores_data.get("task_results", []) if not t.get("passed")] + history = [ + { + "number": item["number"], + "verdict": item["verdict"], + "change_summary": item["change_summary"], + "delta": item["delta"], + } + for item in iterations[-5:] + ] + + if dry_run: + variant_content, change_summary, reasoning = make_dry_run_variant(current_content, i) + variant_output = { + "variant": variant_content, + "summary": change_summary, + "reasoning": reasoning, + "tokens_used": 0, + "deletions": [], + "deletion_justification": "", + } + deletions = [] + deletion_justification = "" + else: + with tempfile.NamedTemporaryFile(mode="w", suffix=target_path.suffix, encoding="utf-8") as current_file: + current_file.write(current_content) + current_file.flush() + variant_result = subprocess.run( + [ + sys.executable, + str(Path(__file__).parent / "generate_variant.py"), + "--target", str(target_path), + "--goal", goal, + "--current-content-file", current_file.name, + "--failures", json.dumps(last_failures), + "--history", json.dumps(history), + "--model", model, + ], + capture_output=True, text=True, timeout=120, + ) + + if variant_result.returncode != 0: + if verbose: + print(f"Variant generation failed: {variant_result.stderr}", file=sys.stderr) + consecutive_reverts += 1 + iterations.append({ + "number": i, "verdict": "REVERT", + "score": {"train": best_score}, + "delta": "0", "change_summary": "Variant generation failed", + "reasoning": variant_result.stderr[:200], "diff": "", + }) + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + continue + + try: + variant_output = json.loads(variant_result.stdout) + variant_content = variant_output["variant"] + change_summary = variant_output.get("summary", "") + reasoning = 
variant_output.get("reasoning", "") + total_tokens += variant_output.get("tokens_used", 0) + deletions = variant_output.get("deletions", []) + deletion_justification = variant_output.get("deletion_justification", "").strip() + except (json.JSONDecodeError, KeyError) as e: + if verbose: + print(f"Parse error: {e}", file=sys.stderr) + consecutive_reverts += 1 + iterations.append({ + "number": i, "verdict": "REVERT", + "score": {"train": best_score}, + "delta": "0", "change_summary": f"Parse error: {e}", + "reasoning": "", "diff": "", + }) + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + continue + + gen_elapsed = time.time() - t0 + + # 2. Validate protected sections + if not check_protected_sections(original_content, variant_content): + if verbose: + print("REJECTED: Protected sections modified", file=sys.stderr) + diff_text = generate_diff(current_content, variant_content, target_label) + save_iteration(output_dir, i, variant_content, {"protected_intact": False}, + "REVERT", "Protected sections modified", diff_text, change_summary) + iterations.append({ + "number": i, "verdict": "REVERT", + "score": {"train": 0.0}, + "delta": "0", "change_summary": "Protected sections modified", + "reasoning": reasoning, "diff": diff_text, + }) + consecutive_reverts += 1 + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + continue + + if deletions and not deletion_justification: + if verbose: + print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr) + diff_text = generate_diff(current_content, variant_content, target_label) + save_iteration( + output_dir, + i, + variant_content, + {"protected_intact": True}, + "REVERT", + "Deleted sections without justification", + diff_text, + change_summary, + deletions=deletions, + ) + iterations.append({ + "number": i, + "verdict": "REVERT", + 
"score": {"train": best_score}, + "delta": "0", + "change_summary": "Deleted sections without justification", + "reasoning": reasoning, + "diff": diff_text, + "deletions": deletions, + "deletion_justification": "", + }) + consecutive_reverts += 1 + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + continue + + # 3. Evaluate variant + temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}" + temp_target.write_text(variant_content) + try: + t0 = time.time() + variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run) + eval_elapsed = time.time() - t0 + variant_composite = composite_score(variant_scores) + finally: + temp_target.unlink(missing_ok=True) + + diff_text = generate_diff(current_content, variant_content, target_label) + + if verbose: + print(f"Score: {variant_composite:.4f} (gain={variant_composite - best_score:.4f}, gen={gen_elapsed:.1f}s, eval={eval_elapsed:.1f}s)", file=sys.stderr) + + # 4. 
Keep/revert (deterministic arithmetic) + gain = variant_composite - best_score + if gain > min_gain: + verdict = "KEEP" + best_score = variant_composite + best_content = variant_content + best_iteration = i + current_content = variant_content + consecutive_reverts = 0 + delta_str = f"+{gain:.2f}" + else: + verdict = "REVERT" + consecutive_reverts += 1 + delta_str = f"{gain:+.2f}" if gain != 0 else "0" + + if deletions and deletion_justification: + change_summary = f"{change_summary} [deletion justified]" + + save_iteration(output_dir, i, variant_content, variant_scores, + verdict, reasoning, diff_text, change_summary, + deletions=deletions, deletion_justification=deletion_justification) + + iteration_data: dict = { + "number": i, "verdict": verdict, + "score": {"train": variant_composite, "test": None}, + "delta": delta_str, "change_summary": change_summary, + "reasoning": reasoning, "diff": diff_text, + "tokens_used": variant_output.get("tokens_used", 0), + "scores": variant_scores, + "deletions": deletions, + "deletion_justification": deletion_justification, + } + + # 5. Goodhart alarm — every 5 iterations, check held-out set + if test_tasks and i % 5 == 0: + try: + temp_target.write_text(best_content) + holdout_scores = assess_target(temp_target, test_tasks, goal, verbose, dry_run) + holdout_composite = composite_score(holdout_scores) + iteration_data["score"]["test"] = holdout_composite + finally: + temp_target.unlink(missing_ok=True) + + if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite): + if verbose: + print(f"GOODHART ALARM: holdout={holdout_composite:.4f} vs baseline={baseline_holdout:.4f}", file=sys.stderr) + exit_reason = f"goodhart_alarm (iteration {i})" + status = "GOODHART_ALARM" + iterations.append(iteration_data) + break + + iterations.append(iteration_data) + + # 6. 
def main():
    """CLI entry point: validate inputs and run the optimization loop."""
    parser = argparse.ArgumentParser(description="Autoresearch optimization loop for agent/skill files")
    parser.add_argument("--target", required=True, help="Path to agent/skill file to optimize")
    parser.add_argument("--goal", required=True, help="Optimization objective")
    parser.add_argument("--benchmark-tasks", required=True, help="Path to benchmark tasks JSON")
    parser.add_argument("--max-iterations", type=int, default=20, help="Max iterations (default: 20)")
    parser.add_argument("--min-gain", type=float, default=0.02, help="Min score gain to keep (default: 0.02)")
    parser.add_argument("--train-split", type=float, default=0.6, help="Train fraction (default: 0.6)")
    # Default mirrors generate_variant.py. Making this flag required
    # contradicted the module docstring's usage example (which omits it)
    # and forced a model choice even in --dry-run mode, where no API call
    # is ever made.
    parser.add_argument("--model", default="claude-sonnet-4-20250514", help="Model for variant generation")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    parser.add_argument("--dry-run", action="store_true", help="Use synthetic scores (test loop mechanics without API)")
    parser.add_argument("--report", default=None, help="Path for live HTML report")
    parser.add_argument("--output-dir", default=None, help="Directory for iteration snapshots")
    args = parser.parse_args()

    target = Path(args.target)
    if not target.exists():
        print(f"Error: Target not found: {target}", file=sys.stderr)
        sys.exit(1)

    tasks_path = Path(args.benchmark_tasks)
    if not tasks_path.exists():
        print(f"Error: Tasks not found: {tasks_path}", file=sys.stderr)
        sys.exit(1)

    try:
        result = run_optimization_loop(
            target_path=target, goal=args.goal,
            benchmark_tasks_path=tasks_path,
            max_iterations=args.max_iterations, min_gain=args.min_gain,
            train_split=args.train_split, model=args.model,
            verbose=args.verbose,
            report_path=Path(args.report) if args.report else None,
            output_dir=Path(args.output_dir) if args.output_dir else None,
            dry_run=args.dry_run,
        )
    except ValueError as e:
        # Task-format and frontmatter validation errors surface here.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    print(json.dumps(result, indent=2))
    if args.verbose:
        print(f"\nExit: {result['exit_reason']}", file=sys.stderr)
        print(f"Best: {result['best_score']:.4f} (iteration {result['best_iteration']})", file=sys.stderr)
        print(f"Improvements: {result['improvements_found']}/{result['iterations_run']}", file=sys.stderr)
+ font-size: 11.5px; + font-family: var(--font-mono); + max-height: 400px; + overflow: auto; + white-space: pre; + line-height: 1.6; + color: #8899bb; +} + +.verdict-keep { color: var(--green); font-weight: 600; } +.verdict-revert { color: var(--red); font-weight: 600; } +.verdict-stop { color: var(--yellow); font-weight: 600; } + +.opt-actions { + margin-top: 16px; + display: flex; + gap: 10px; + align-items: center; + flex-wrap: wrap; +} + +.opt-btn { + padding: 8px 18px; + border-radius: var(--radius); + border: 1px solid var(--border-2); + background: var(--surface-2); + color: var(--text); + cursor: pointer; + font-size: 12px; + font-family: var(--font-sans); + font-weight: 500; + transition: all 0.15s; +} + +.opt-btn:hover { color: var(--bright); background: var(--surface-3); } +.opt-btn:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; } + +.opt-btn-primary { + background: var(--accent); + color: #fff; + border: none; + box-shadow: 0 1px 4px rgba(77, 142, 245, 0.3); +} + +.opt-btn-primary:hover { background: #5a99f8; } @@ -656,12 +755,14 @@

Blind A/B Code Review

+
+
diff --git a/skills/skill-creator/scripts/eval_compare.py b/skills/skill-creator/scripts/eval_compare.py index 58f1849..cfae534 100644 --- a/skills/skill-creator/scripts/eval_compare.py +++ b/skills/skill-creator/scripts/eval_compare.py @@ -162,6 +162,38 @@ def find_iteration_dirs(workspace: Path) -> list[Path]: return [d for d in dirs if d.is_dir()] +def is_optimization_data(data: object) -> bool: + """Return True when the payload matches optimize_loop.py results.""" + if not isinstance(data, dict): + return False + iterations = data.get("iterations") + if not isinstance(iterations, list): + return False + if "baseline_score" not in data: + return False + if "target" not in data: + return False + return all( + isinstance(item, dict) and "number" in item and "verdict" in item + for item in iterations + ) + + +def load_optimization_data(workspace: Path) -> dict | None: + """Load optimization loop results when present in the workspace.""" + candidates = [ + workspace / "results.json", + workspace / "evals" / "iterations" / "results.json", + workspace / "out" / "results.json", + ] + for path in candidates: + if path.exists(): + data = load_json_safe(path) + if is_optimization_data(data): + return data + return None + + def build_data(workspace: Path) -> dict: """Build full comparison data.""" evals_path = workspace / "evals" / "evals.json" @@ -185,6 +217,7 @@ def build_data(workspace: Path) -> dict: "variantAName": "Variant A", "variantBName": "Variant B", "variantCName": "Variant C", + "optimization": load_optimization_data(workspace), } iteration = iterations[-1] # Latest iteration @@ -239,6 +272,7 @@ def build_data(workspace: Path) -> dict: "variantAName": variants.get("A", {}).get("name", "Variant A"), "variantBName": variants.get("B", {}).get("name", "Variant B"), "variantCName": variants.get("C", {}).get("name", "Variant C"), + "optimization": load_optimization_data(workspace), } From 1d3b45291a647a4e0246c450c2dd414fce616ba2 Mon Sep 17 00:00:00 2001 From: notque 
Date: Sun, 29 Mar 2026 08:10:42 -0700 Subject: [PATCH 02/12] feat(autoresearch): migrate SDK to claude -p, add beam search, fix review issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Migrate generate_variant.py and improve_description.py from Anthropic SDK to claude -p subprocess invocation - Add beam search optimization with configurable width, candidates per parent, and frontier retention to optimize_loop.py - Add beam search parameters display and empty-state UX in eval_viewer.html - Update SKILL.md and optimization-guide.md for beam search documentation - Migrate skill-eval run_loop and rules-distill to use claude -p - Add test coverage for beam search, model flag omission, and claude -p flow Fixes from review: - Fix misplaced test_writes_pending_json_in_live_mode (back in TestFullPipeline) - Remove dead round_keeps variable from optimize_loop.py - Fix timeout mismatch (120s outer vs 300s inner → 360s outer) - Clarify --max-iterations help text (rounds, not individual iterations) --- scripts/rules-distill.py | 55 +- scripts/skill_eval/__init__.py | 5 +- scripts/skill_eval/improve_description.py | 111 +-- scripts/skill_eval/run_loop.py | 8 +- .../test_agent_comparison_optimize_loop.py | 262 ++++++- scripts/tests/test_rules_distill.py | 27 + scripts/tests/test_skill_eval_claude_code.py | 51 ++ skills/agent-comparison/SKILL.md | 37 +- .../references/optimization-guide.md | 49 +- .../scripts/generate_variant.py | 120 +-- .../agent-comparison/scripts/optimize_loop.py | 730 ++++++++++++------ skills/skill-creator/assets/eval_viewer.html | 6 + skills/skill-eval/SKILL.md | 9 +- 13 files changed, 1048 insertions(+), 422 deletions(-) create mode 100644 scripts/tests/test_skill_eval_claude_code.py diff --git a/scripts/rules-distill.py b/scripts/rules-distill.py index 77d57f1..db1b640 100644 --- a/scripts/rules-distill.py +++ b/scripts/rules-distill.py @@ -24,7 +24,9 @@ import argparse import json +import os import re 
+import subprocess import sys from datetime import datetime, timezone from pathlib import Path @@ -307,18 +309,49 @@ def filter_layer4_not_covered( # --------------------------------------------------------------------------- +def _run_claude_code(prompt: str, model: str | None = None) -> tuple[str, str]: + """Run Claude Code and return (assistant_text, raw_result_text).""" + cmd = ["claude", "-p", prompt, "--output-format", "json", "--print"] + if model: + cmd.extend(["--model", model]) + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=str(REPO_ROOT), + env=env, + timeout=300, + ) + if result.returncode != 0: + return "", "" + + try: + events = json.loads(result.stdout) + except json.JSONDecodeError: + return "", "" + + assistant_text = "" + raw_result_text = "" + for event in events: + if event.get("type") == "assistant": + message = event.get("message", {}) + for content in message.get("content", []): + if content.get("type") == "text": + assistant_text += content.get("text", "") + elif event.get("type") == "result": + raw_result_text = event.get("result", "") + + return assistant_text or raw_result_text, raw_result_text + + def _llm_extract_principles(skill_content: str, skill_name: str) -> list[dict] | None: - """Try to extract principles via Anthropic SDK. + """Try to extract principles via Claude Code. Returns list of dicts with "principle" key, or None if unavailable. """ try: - import anthropic # type: ignore[import] - except ImportError: - return None - - try: - client = anthropic.Anthropic() prompt = f"""You are analyzing a Claude Code skill file to extract cross-cutting behavioral principles. 
Skill: {skill_name} @@ -337,12 +370,8 @@ def _llm_extract_principles(skill_content: str, skill_name: str) -> list[dict] | Return [] if no universal principles are found.""" - message = client.messages.create( - model="claude-haiku-4-5", - max_tokens=1024, - messages=[{"role": "user", "content": prompt}], - ) - raw = message.content[0].text.strip() + raw, _ = _run_claude_code(prompt, model="claude-haiku-4-5") + raw = raw.strip() # Parse JSON principles = json.loads(raw) if not isinstance(principles, list): diff --git a/scripts/skill_eval/__init__.py b/scripts/skill_eval/__init__.py index 34d2ca8..65f3d05 100644 --- a/scripts/skill_eval/__init__.py +++ b/scripts/skill_eval/__init__.py @@ -1,10 +1,9 @@ """Skill evaluation and description optimization toolkit. -Ported from Anthropic's skill-creator (https://github.com/anthropics/skills). -Adapted for the agents repo architecture. +Adapted from the upstream skill-creator workflow for this repo architecture. Usage: python -m scripts.skill_eval.run_eval --eval-set evals.json --skill-path path/to/skill - python -m scripts.skill_eval.run_loop --eval-set evals.json --skill-path path/to/skill --model claude-opus-4-6 + python -m scripts.skill_eval.run_loop --eval-set evals.json --skill-path path/to/skill python -m scripts.skill_eval.quick_validate path/to/skill """ diff --git a/scripts/skill_eval/improve_description.py b/scripts/skill_eval/improve_description.py index 1deb2b8..c759219 100644 --- a/scripts/skill_eval/improve_description.py +++ b/scripts/skill_eval/improve_description.py @@ -2,33 +2,81 @@ """Improve a skill description based on eval results. Takes eval results (from run_eval.py) and generates an improved description -using Claude with extended thinking. +through `claude -p`. 
""" import argparse import json +import os import re +import subprocess import sys from pathlib import Path -import anthropic - from scripts.skill_eval.utils import parse_skill_md +def _find_project_root() -> Path: + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def _run_claude_code(prompt: str, model: str | None) -> tuple[str, str]: + """Run Claude Code and return (assistant_text, raw_result_text).""" + cmd = ["claude", "-p", prompt, "--output-format", "json", "--print"] + if model: + cmd.extend(["--model", model]) + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=str(_find_project_root()), + env=env, + timeout=300, + ) + if result.returncode != 0: + print(f"Error: claude -p failed with code {result.returncode}", file=sys.stderr) + if result.stderr: + print(result.stderr.strip(), file=sys.stderr) + sys.exit(1) + + try: + events = json.loads(result.stdout) + except json.JSONDecodeError as exc: + print(f"Error: could not parse claude -p JSON output: {exc}", file=sys.stderr) + sys.exit(1) + + assistant_text = "" + raw_result_text = "" + for event in events: + if event.get("type") == "assistant": + message = event.get("message", {}) + for content in message.get("content", []): + if content.get("type") == "text": + assistant_text += content.get("text", "") + elif event.get("type") == "result": + raw_result_text = event.get("result", "") + + return assistant_text or raw_result_text, raw_result_text + + def improve_description( - client: anthropic.Anthropic, skill_name: str, skill_content: str, current_description: str, eval_results: dict, history: list[dict], - model: str, + model: str | None, test_results: dict | None = None, log_dir: Path | None = None, iteration: int | None = None, ) -> str: - """Call Claude to improve the description based on eval results.""" + """Call Claude Code 
to improve the description based on eval results.""" failed_triggers = [r for r in eval_results["results"] if r["should_trigger"] and not r["pass"]] false_triggers = [r for r in eval_results["results"] if not r["should_trigger"] and not r["pass"]] @@ -107,24 +155,7 @@ def improve_description( Please respond with only the new description text in tags, nothing else.""" - response = client.messages.create( - model=model, - max_tokens=16000, - thinking={ - "type": "enabled", - "budget_tokens": 10000, - }, - messages=[{"role": "user", "content": prompt}], - ) - - # Extract thinking and text from response - thinking_text = "" - text = "" - for block in response.content: - if block.type == "thinking": - thinking_text = block.thinking - elif block.type == "text": - text = block.text + text, raw_result_text = _run_claude_code(prompt, model) # Parse out the tags match = re.search(r"(.*?)", text, re.DOTALL) @@ -134,8 +165,8 @@ def improve_description( transcript: dict = { "iteration": iteration, "prompt": prompt, - "thinking": thinking_text, "response": text, + "raw_result_text": raw_result_text, "parsed_description": description, "char_count": len(description), "over_limit": len(description) > 1024, @@ -144,34 +175,18 @@ def improve_description( # If over 1024 chars, ask the model to shorten it if len(description) > 1024: shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in tags." 
- shorten_response = client.messages.create( - model=model, - max_tokens=16000, - thinking={ - "type": "enabled", - "budget_tokens": 10000, - }, - messages=[ - {"role": "user", "content": prompt}, - {"role": "assistant", "content": text}, - {"role": "user", "content": shorten_prompt}, - ], + rewrite_request = ( + f"{prompt}\n\nPrevious assistant response:\n\n{text}\n\n\n" + f"{shorten_prompt}" ) - - shorten_thinking = "" - shorten_text = "" - for block in shorten_response.content: - if block.type == "thinking": - shorten_thinking = block.thinking - elif block.type == "text": - shorten_text = block.text + shorten_text, shorten_raw_result_text = _run_claude_code(rewrite_request, model) match = re.search(r"(.*?)", shorten_text, re.DOTALL) shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') transcript["rewrite_prompt"] = shorten_prompt - transcript["rewrite_thinking"] = shorten_thinking transcript["rewrite_response"] = shorten_text + transcript["rewrite_raw_result_text"] = shorten_raw_result_text transcript["rewrite_description"] = shortened transcript["rewrite_char_count"] = len(shortened) description = shortened @@ -191,7 +206,7 @@ def main(): parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") parser.add_argument("--skill-path", required=True, help="Path to skill directory") parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") - parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--model", default=None, help="Optional Claude Code model override") parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") args = parser.parse_args() @@ -212,9 +227,7 @@ def main(): print(f"Current: {current_description}", file=sys.stderr) print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr) - client = anthropic.Anthropic() 
new_description = improve_description( - client=client, skill_name=name, skill_content=content, current_description=current_description, diff --git a/scripts/skill_eval/run_loop.py b/scripts/skill_eval/run_loop.py index 6bc0c61..48034e6 100644 --- a/scripts/skill_eval/run_loop.py +++ b/scripts/skill_eval/run_loop.py @@ -15,8 +15,6 @@ import webbrowser from pathlib import Path -import anthropic - from scripts.skill_eval.generate_report import generate_html from scripts.skill_eval.improve_description import improve_description from scripts.skill_eval.run_eval import find_project_root, run_eval @@ -56,7 +54,7 @@ def run_loop( runs_per_query: int, trigger_threshold: float, holdout: float, - model: str, + model: str | None, verbose: bool, live_report_path: Path | None = None, log_dir: Path | None = None, @@ -75,7 +73,6 @@ def run_loop( train_set = eval_set test_set = [] - client = anthropic.Anthropic() history = [] exit_reason = "unknown" @@ -206,7 +203,6 @@ def print_eval_stats(label, results, elapsed): # Strip test scores from history so improvement model can't see them blinded_history = [{k: v for k, v in h.items() if not k.startswith("test_")} for h in history] new_description = improve_description( - client=client, skill_name=name, skill_content=content, current_description=current_description, @@ -264,7 +260,7 @@ def main(): parser.add_argument( "--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)" ) - parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--model", default=None, help="Optional Claude Code model override") parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") parser.add_argument( "--report", diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index a0f2faa..d3c8bdc 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ 
b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -1,6 +1,7 @@ import importlib.util import json from pathlib import Path +import subprocess import sys @@ -76,39 +77,33 @@ def test_generate_variant_main_reads_current_content_from_file(tmp_path, monkeyp "skills/agent-comparison/scripts/generate_variant.py", ) - class FakeBlock: - def __init__(self, block_type: str, text: str): - self.type = block_type - if block_type == "thinking": - self.thinking = text - else: - self.text = text - - class FakeResponse: - def __init__(self): - self.content = [ - FakeBlock("thinking", "reasoning"), - FakeBlock( - "text", - "---\ndescription: updated\n---" - "updated", - ), - ] - self.usage = type("Usage", (), {"input_tokens": 1, "output_tokens": 2})() - - class FakeClient: - def __init__(self): - self.messages = type("Messages", (), {"create": lambda self, **kwargs: FakeResponse()})() - - class FakeAnthropicModule: - class Anthropic: - def __new__(cls): - return FakeClient() - content_file = tmp_path / "current.md" content_file.write_text("---\ndescription: current\n---\n") - monkeypatch.setattr(generate_variant, "anthropic", FakeAnthropicModule) + def fake_run(cmd, capture_output, text, cwd, env, timeout): + assert cmd[:2] == ["claude", "-p"] + payload = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "text", + "text": "---\ndescription: updated\n---" + "updated", + } + ] + }, + }, + { + "type": "result", + "result": "raw result", + "usage": {"input_tokens": 1, "output_tokens": 2}, + }, + ] + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + monkeypatch.setattr(generate_variant.subprocess, "run", fake_run) monkeypatch.setattr( sys, "argv", @@ -130,3 +125,210 @@ def __new__(cls): assert output["variant"] == "---\ndescription: updated\n---" assert output["tokens_used"] == 3 + assert output["reasoning"] == "raw result" + + +def test_optimize_loop_omits_model_flag_when_not_provided(tmp_path, monkeypatch): + 
optimize_loop = load_module( + "agent_comparison_optimize_loop_nomodel", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text( + "---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n" + ) + tasks = [ + {"name": "train-positive", "query": "write go tests", "should_trigger": True, "split": "train"}, + {"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "split": "test"}, + ] + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text(json.dumps({"tasks": tasks})) + + seen_cmds = [] + + def fake_assess_target(*args, **kwargs): + return { + "parses": True, + "correctness": 1.0, + "conciseness": 1.0, + "clarity": 1.0, + "task_results": [{"name": "train-positive", "passed": False}], + } + + def fake_run(cmd, capture_output, text, timeout): + seen_cmds.append(cmd) + payload = { + "variant": target.read_text(), + "summary": "no-op", + "reasoning": "ok", + "tokens_used": 0, + "deletion_justification": "", + } + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + monkeypatch.setattr(optimize_loop.subprocess, "run", fake_run) + + optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.02, + train_split=0.6, + model=None, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + verbose=False, + dry_run=False, + ) + + assert seen_cmds + assert "--model" not in seen_cmds[0] + + +def test_optimize_loop_respects_revert_streak_limit(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_revert_limit", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text( + "---\nname: test-skill\ndescription: test description\nversion: 
1.0.0\n---\n\n# Skill\n" + ) + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "write go tests", "should_trigger": True, "split": "train"}, + {"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "split": "test"}, + ] + } + ) + ) + + def fake_assess_target(*args, **kwargs): + return { + "parses": True, + "correctness": 0.0, + "conciseness": 1.0, + "clarity": 1.0, + "task_results": [{"name": "train-positive", "passed": False}], + } + + def fake_run(cmd, capture_output, text, timeout): + payload = { + "variant": target.read_text(), + "summary": "no-op", + "reasoning": "ok", + "tokens_used": 0, + "deletion_justification": "", + } + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + monkeypatch.setattr(optimize_loop.subprocess, "run", fake_run) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=10, + min_gain=0.02, + train_split=0.6, + revert_streak_limit=2, + model=None, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + verbose=False, + dry_run=False, + ) + + assert result["status"] == "CONVERGED" + assert "2 rounds without KEEP" in result["exit_reason"] + + +def test_optimize_loop_beam_search_retains_top_k_candidates(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_beam", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text( + "---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n" + ) + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "write go tests", "should_trigger": True, "split": "train"}, + 
{"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "split": "test"}, + ] + } + ) + ) + + generated = iter(["alpha", "beta"]) + + def fake_run(cmd, capture_output, text, timeout): + label = next(generated) + payload = { + "variant": target.read_text() + f"\n\n", + "summary": f"candidate-{label}", + "reasoning": "ok", + "tokens_used": 10, + "deletion_justification": "", + } + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + def fake_assess_target(path, *args, **kwargs): + content = Path(path).read_text() + score = 0.0 + if "" in content: + score = 1.2 + elif "" in content: + score = 2.4 + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": score, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [], + } + + monkeypatch.setattr(optimize_loop.subprocess, "run", fake_run) + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + train_split=0.6, + beam_width=2, + candidates_per_parent=2, + model=None, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + verbose=False, + dry_run=False, + ) + + assert result["search_strategy"] == "beam" + assert result["beam_width"] == 2 + assert result["candidates_per_parent"] == 2 + assert result["improvements_found"] == 2 + selected = [it for it in result["iterations"] if it.get("selected_for_frontier")] + assert len(selected) == 2 + assert selected[0]["frontier_rank"] == 1 or selected[1]["frontier_rank"] == 1 diff --git a/scripts/tests/test_rules_distill.py b/scripts/tests/test_rules_distill.py index 5acbc6a..854d963 100644 --- a/scripts/tests/test_rules_distill.py +++ b/scripts/tests/test_rules_distill.py @@ -8,6 +8,7 @@ import 
importlib import json +import subprocess import sys import tempfile from datetime import datetime, timedelta, timezone @@ -549,6 +550,32 @@ def test_writes_pending_json_in_live_mode(self, tmp_path): assert "skills_scanned" in written assert "candidates" in written + +class TestLlmExtraction: + def test_llm_extract_principles_uses_claude_code(self): + payload = json.dumps( + [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": '["Always verify output before completion"]'}]}, + }, + {"type": "result", "result": "ok"}, + ] + ) + completed = subprocess.CompletedProcess(["claude"], 0, stdout=payload, stderr="") + + with patch.object(rules_distill.subprocess, "run", return_value=completed) as mock_run: + result = rules_distill._llm_extract_principles("content", "skill-a") + + assert result == [ + { + "principle": "Always verify output before completion", + "raw": "Always verify output before completion", + "source": "skill-a", + } + ] + assert mock_run.call_args.args[0][:2] == ["claude", "-p"] + def test_candidates_have_required_fields(self, tmp_path): """All candidates must have id, principle, skills, status, confidence, verdict.""" skills_dir = tmp_path / "skills" diff --git a/scripts/tests/test_skill_eval_claude_code.py b/scripts/tests/test_skill_eval_claude_code.py new file mode 100644 index 0000000..c241105 --- /dev/null +++ b/scripts/tests/test_skill_eval_claude_code.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import json +import subprocess +from pathlib import Path + + +def test_improve_description_uses_claude_code_and_shortens(monkeypatch, tmp_path): + from scripts.skill_eval import improve_description as mod + + calls: list[list[str]] = [] + + def fake_run(cmd, capture_output, text, cwd, env, timeout): + calls.append(cmd) + if len(calls) == 1: + text_out = ( + "" + + ("a" * 1030) + + "" + ) + else: + text_out = "short and valid" + payload = [ + {"type": "assistant", "message": {"content": [{"type": "text", "text": 
text_out}]}}, + {"type": "result", "result": "raw result"}, + ] + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + monkeypatch.setattr(mod.subprocess, "run", fake_run) + + description = mod.improve_description( + skill_name="skill-eval", + skill_content="# Skill", + current_description="old", + eval_results={ + "results": [{"query": "improve this skill", "should_trigger": True, "pass": False, "triggers": 0, "runs": 1}], + "summary": {"passed": 0, "failed": 1, "total": 1}, + }, + history=[], + model=None, + log_dir=tmp_path, + iteration=1, + ) + + assert description == "short and valid" + assert calls + assert calls[0][:2] == ["claude", "-p"] + transcript = json.loads((tmp_path / "improve_iter_1.json").read_text()) + assert transcript["raw_result_text"] == "raw result" + assert transcript["rewrite_raw_result_text"] == "raw result" + diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md index 60a155d..21e8c15 100644 --- a/skills/agent-comparison/SKILL.md +++ b/skills/agent-comparison/SKILL.md @@ -262,7 +262,7 @@ Remove temporary benchmark files and debug outputs. Keep only the comparison rep ### Phase 5: OPTIMIZE (optional — invoked explicitly) -**Goal**: Run an automated optimization loop that iteratively improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then keeps only measured improvements. +**Goal**: Run an automated optimization loop that improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then selects the best measured variants through beam search or single-path search. This phase is for routing/trigger optimization, not full code-generation benchmarking. Invoke it when the user says "optimize this skill", "optimize the description", or "run autoresearch". The existing manual A/B comparison (Phases 1-4) remains the path for full agent benchmarking. 
@@ -288,7 +288,6 @@ python3 skills/agent-comparison/scripts/optimize_loop.py \ --goal "{optimization goal}" \ --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ --train-split 0.6 \ - --model claude-sonnet-4-20250514 \ --verbose ``` @@ -309,12 +308,15 @@ The loop automatically evaluates the unmodified target against the train set bef **Step 4: Enter optimization loop** The `optimize_loop.py` script handles the full loop: -- Calls `generate_variant.py` to propose changes (Claude with extended thinking) +- Calls `generate_variant.py` to propose changes through `claude -p` - Evaluates each variant against train tasks -- Keeps variants that improve score by more than `--min-gain` (default 0.02) +- Runs either: + - single-path hill climbing: `--beam-width 1 --candidates-per-parent 1` + - beam search with top-K retention: keep the best `K` improving candidates each round +- Keeps variants that beat their parent by more than `--min-gain` (default 0.02) - Reverts variants that don't improve, break hard gates, or delete sections without justification -- Checks held-out test set every 5 iterations for Goodhart divergence -- Stops on convergence (5 consecutive reverts), Goodhart alarm, or max iterations +- Checks held-out test set every `--holdout-check-cadence` rounds for Goodhart divergence +- Stops on convergence (`--revert-streak-limit` rounds without any KEEP), Goodhart alarm, or max iterations ```bash python3 skills/agent-comparison/scripts/optimize_loop.py \ @@ -324,14 +326,25 @@ python3 skills/agent-comparison/scripts/optimize_loop.py \ --max-iterations 20 \ --min-gain 0.02 \ --train-split 0.6 \ - --model claude-sonnet-4-20250514 \ + --beam-width 3 \ + --candidates-per-parent 2 \ + --revert-streak-limit 8 \ + --holdout-check-cadence 5 \ --report optimization-report.html \ --output-dir evals/iterations \ --verbose ``` +Omit `--model` to use Claude Code's configured default model, or pass it explicitly if you need a specific override. 
+ The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and review/export controls. +Recommended modes: +- Fast single-path optimization: `--beam-width 1 --candidates-per-parent 1` +- True autoresearch sweep: `--max-iterations 20 --beam-width 3 --candidates-per-parent 2 --revert-streak-limit 20` +- Conservative search with strict keeps: raise `--min-gain` above `0.02` +- Exploratory search that accepts small wins: use `--min-gain 0.0` + **Step 5: Present results in UI** Open the generated `optimization-report.html` in a browser. The report shows: @@ -345,13 +358,15 @@ Open the generated `optimization-report.html` in a browser. The report shows: Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews the kept iterations as candidate snapshots from the original target: - Inspect each kept iteration's diff in the report - Use "Preview Selected Snapshot" only as a comparison aid in the UI -- Use "Export Selected" to download a review JSON describing the selected snapshot diffs +- Use "Export Selected" to download a review JSON describing the selected snapshot diff +- In beam mode, review the retained frontier candidates first; they are the strongest candidates from the latest round **Step 7: Apply selected improvements to target file** Apply one reviewed improvement to the original target file. - If you want the best single kept variant, use `evals/iterations/best_variant.md`. +- Beam search still writes a single `best_variant.md`: the highest-scoring kept candidate seen anywhere in the run. - If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple kept diffs into a generated patch. 
```bash @@ -370,7 +385,11 @@ After applying improvements, run a final evaluation on ALL tasks (not just train # Re-run optimize_loop.py against the same task file and inspect results.json/report output ``` -Compare final scores to the baseline to confirm net improvement. +Compare final scores to the baseline to confirm net improvement. In beam mode, the final report and `results.json` also include: +- `beam_width` +- `candidates_per_parent` +- `holdout_check_cadence` +- per-iteration frontier metadata (`selected_for_frontier`, `frontier_rank`, `parent_iteration`) **Step 9: Record in learning-db** diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md index 3b74e16..3aa0f6a 100644 --- a/skills/agent-comparison/references/optimization-guide.md +++ b/skills/agent-comparison/references/optimization-guide.md @@ -90,17 +90,35 @@ python3 skills/agent-comparison/scripts/optimize_loop.py \ --train-split 0.6 \ --max-iterations 20 \ --min-gain 0.02 \ - --model claude-sonnet-4-20250514 \ + --beam-width 3 \ + --candidates-per-parent 2 \ + --revert-streak-limit 20 \ + --holdout-check-cadence 5 \ --report optimization-report.html \ --output-dir evals/iterations \ --verbose ``` +By default this uses Claude Code's configured model via `claude -p`. Pass `--model` only when you want to override that explicitly. 
+ Useful flags: -- `--dry-run`: exercise the loop mechanics without API calls +- `--dry-run`: exercise the loop mechanics without calling Claude Code - `--report`: write a live HTML report - `--output-dir`: persist iteration snapshots and `results.json` +- `--beam-width`: retain the best K improving candidates per round +- `--candidates-per-parent`: generate multiple sibling variants from each frontier candidate +- `--revert-streak-limit`: stop after N rounds without any KEEP candidates +- `--holdout-check-cadence`: evaluate the global best on held-out tasks every N rounds + +Recommended search presets: + +- Single-path local search: + - `--beam-width 1 --candidates-per-parent 1` +- Balanced beam search: + - `--beam-width 3 --candidates-per-parent 2` +- Aggressive exploration: + - `--beam-width 5 --candidates-per-parent 3 --min-gain 0.0` ## Evaluation Model @@ -109,6 +127,7 @@ The loop follows the ADR-131 structure: 1. Hard gates 2. Weighted composite score 3. Held-out regression checks +4. Frontier retention ### Layer 1: Hard Gates @@ -126,14 +145,27 @@ preserved verbatim. ### Layer 2: Composite Score The loop converts trigger-rate evaluation results into a weighted composite -score using the built-in weights in `optimize_loop.py`. A variant is kept only -if it beats the previous best by more than `--min-gain`. +score using the built-in weights in `optimize_loop.py`. A candidate is kept only +if it beats its parent by more than `--min-gain`. ### Layer 3: Held-Out Regression Check -Every 5 iterations, the current best variant is scored on the held-out test set. -If held-out performance drops below the baseline while train performance has -improved, the loop raises a Goodhart alarm and stops. +Every `--holdout-check-cadence` rounds, the current global best variant is +scored on the held-out test set. If held-out performance drops below the +baseline while train performance has improved, the loop raises a Goodhart +alarm and stops. 
+ +### Layer 4: Frontier Retention + +When beam search is enabled: + +- each frontier candidate generates `--candidates-per-parent` siblings +- every sibling is scored independently +- the top `--beam-width` KEEP candidates become the next frontier +- `best_variant.md` still tracks the single best candidate seen anywhere in the run + +When `--beam-width 1 --candidates-per-parent 1`, the behavior collapses back to +the original single-path optimizer. ## Deletion Safety Rule @@ -156,6 +188,9 @@ When `--output-dir` is set, the loop writes: - `best_variant.md` - `results.json` +`results.json` also records search metadata such as `beam_width`, +`candidates_per_parent`, and per-iteration frontier selection markers. + When `--report` is set, it also writes a live HTML dashboard showing: - status, baseline, best score, kept/reverted counts diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py index 2378e50..bccf512 100644 --- a/skills/agent-comparison/scripts/generate_variant.py +++ b/skills/agent-comparison/scripts/generate_variant.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -"""Generate a variant of an agent/skill file using Claude with extended thinking. +"""Generate a variant of an agent/skill file using Claude Code. Proposes modifications to improve the target file based on the optimization goal and previous iteration failures. Preserves protected sections marked with DO NOT OPTIMIZE markers. -Pattern: follows improve_description.py's Claude + extended thinking approach. +Pattern: uses `claude -p` so generation runs through Claude Code directly. Usage: python3 skills/agent-comparison/scripts/generate_variant.py \ @@ -13,7 +13,7 @@ --goal "improve error handling instructions" \ --current-content "..." 
\ --failures '[...]' \ - --model claude-sonnet-4-20250514 + --model claude-opus-4-6 Output (JSON to stdout): { @@ -31,13 +31,11 @@ import argparse import json +import os import re +import subprocess import sys - -try: - import anthropic -except ImportError: # pragma: no cover - exercised in environments without the SDK - anthropic = None +from pathlib import Path # --------------------------------------------------------------------------- # Protected section handling @@ -94,16 +92,68 @@ def detect_deletions(original: str, variant: str) -> list[str]: # --------------------------------------------------------------------------- +def _find_project_root() -> Path: + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def _run_claude_code(prompt: str, model: str | None) -> tuple[str, str, int]: + """Run Claude Code and return (response_text, raw_result_text, tokens_used).""" + cmd = ["claude", "-p", prompt, "--output-format", "json", "--print"] + if model: + cmd.extend(["--model", model]) + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=str(_find_project_root()), + env=env, + timeout=300, + ) + if result.returncode != 0: + print(f"Error: claude -p failed with code {result.returncode}", file=sys.stderr) + if result.stderr: + print(result.stderr.strip(), file=sys.stderr) + sys.exit(1) + + try: + events = json.loads(result.stdout) + except json.JSONDecodeError as exc: + print(f"Error: could not parse claude -p JSON output: {exc}", file=sys.stderr) + sys.exit(1) + + assistant_text = "" + raw_result_text = "" + tokens_used = 0 + for event in events: + if event.get("type") == "assistant": + message = event.get("message", {}) + for content in message.get("content", []): + if content.get("type") == "text": + assistant_text += content.get("text", "") + elif event.get("type") == "result": + 
raw_result_text = event.get("result", "") + usage = event.get("usage", {}) + tokens_used = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) + + return assistant_text or raw_result_text, raw_result_text, tokens_used + + def generate_variant( - client: anthropic.Anthropic, target_path: str, goal: str, current_content: str, failures: list[dict], - model: str, + model: str | None, history: list[dict] | None = None, + diversification_note: str | None = None, ) -> dict: - """Call Claude to generate a variant of the target file. + """Call Claude Code to generate a variant of the target file. Returns dict with variant content, summary, reasoning, and token count. """ @@ -120,6 +170,10 @@ def generate_variant( for h in history: history_section += f" Iteration {h.get('number', '?')}: {h.get('verdict', '?')} — {h.get('change_summary', '')}\n" + diversification_section = "" + if diversification_note: + diversification_section = f"\n\nSearch diversification instruction:\n{diversification_note}\n" + protected_sections = extract_protected(current_content) protected_notice = "" if protected_sections: @@ -141,7 +195,7 @@ def generate_variant( {current_content} -{failure_section}{history_section}{protected_notice} +{failure_section}{history_section}{diversification_section}{protected_notice} SAFETY RULES: 1. Do NOT delete sections without replacing them with equivalent or better content. @@ -180,31 +234,7 @@ def generate_variant( [why any removed section was replaced safely, or leave blank]
""" - try: - response = client.messages.create( - model=model, - max_tokens=16000, - thinking={ - "type": "enabled", - "budget_tokens": 10000, - }, - messages=[{"role": "user", "content": prompt}], - ) - except anthropic.APIStatusError as e: - print(f"Error: API returned status {e.status_code}: {e.message}", file=sys.stderr) - sys.exit(1) - except anthropic.APIConnectionError as e: - print(f"Error: API connection failed: {e}", file=sys.stderr) - sys.exit(1) - - # Extract thinking and text - thinking_text = "" - text = "" - for block in response.content: - if block.type == "thinking": - thinking_text = block.thinking - elif block.type == "text": - text = block.text + text, raw_result_text, tokens_used = _run_claude_code(prompt, model) # Parse variant content variant_match = re.search(r"(.*?)", text, re.DOTALL) @@ -229,13 +259,11 @@ def generate_variant( if deletions: print(f"Warning: Deleted sections: {deletions}", file=sys.stderr) - tokens_used = response.usage.input_tokens + response.usage.output_tokens - return { "variant": variant, "summary": summary, "deletion_justification": deletion_justification, - "reasoning": thinking_text, + "reasoning": raw_result_text, "tokens_used": tokens_used, "deletions": deletions, } @@ -255,7 +283,8 @@ def main(): content_group.add_argument("--current-content-file", help="Path to a file containing the current content") parser.add_argument("--failures", default="[]", help="JSON list of failed tasks") parser.add_argument("--history", default="[]", help="JSON list of previous iterations") - parser.add_argument("--model", default="claude-sonnet-4-20250514", help="Model to use") + parser.add_argument("--diversification-note", default=None, help="Optional search diversification hint") + parser.add_argument("--model", default=None, help="Optional Claude Code model override") args = parser.parse_args() try: @@ -269,25 +298,20 @@ def main(): print(f"Error: --history is not valid JSON: {e}", file=sys.stderr) sys.exit(1) - if anthropic is 
None: - print("Error: anthropic SDK is not installed", file=sys.stderr) - sys.exit(1) - current_content = ( - open(args.current_content_file, encoding="utf-8").read() + Path(args.current_content_file).read_text(encoding="utf-8") if args.current_content_file else args.current_content ) - client = anthropic.Anthropic() result = generate_variant( - client=client, target_path=args.target, goal=args.goal, current_content=current_content, failures=failures, model=args.model, history=history if history else None, + diversification_note=args.diversification_note, ) print(json.dumps(result, indent=2)) diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index 228dd1d..54eea55 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -89,6 +89,7 @@ def save_iteration( stop_reason: str | None = None, deletions: list[str] | None = None, deletion_justification: str = "", + metadata: dict | None = None, ) -> dict: """Save a full iteration snapshot and return its metadata.""" iter_dir = output_dir / f"{iteration:03d}" @@ -107,6 +108,8 @@ def save_iteration( "deletions": deletions or [], "deletion_justification": deletion_justification, } + if metadata: + verdict_data.update(metadata) (iter_dir / "verdict.json").write_text(json.dumps(verdict_data, indent=2)) if diff_text: @@ -148,6 +151,66 @@ def make_dry_run_variant(current_content: str, iteration: int) -> tuple[str, str return variant, "Synthetic dry-run mutation", "dry-run synthetic variant" +def _generate_variant_output( + current_content: str, + target_path: Path, + goal: str, + last_failures: list[dict], + history: list[dict], + model: str | None, + dry_run: bool, + iteration_number: int, + diversification_note: str | None = None, +) -> dict: + """Generate a candidate variant either synthetically or through Claude Code.""" + if dry_run: + variant_content, change_summary, reasoning = 
make_dry_run_variant(current_content, iteration_number) + return { + "variant": variant_content, + "summary": change_summary, + "reasoning": reasoning, + "tokens_used": 0, + "deletions": [], + "deletion_justification": "", + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=target_path.suffix, encoding="utf-8") as current_file: + current_file.write(current_content) + current_file.flush() + variant_cmd = [ + sys.executable, + str(Path(__file__).parent / "generate_variant.py"), + "--target", + str(target_path), + "--goal", + goal, + "--current-content-file", + current_file.name, + "--failures", + json.dumps(last_failures), + "--history", + json.dumps(history), + ] + if diversification_note: + variant_cmd.extend(["--diversification-note", diversification_note]) + if model: + variant_cmd.extend(["--model", model]) + variant_result = subprocess.run( + variant_cmd, + capture_output=True, + text=True, + timeout=360, + ) + + if variant_result.returncode != 0: + raise RuntimeError(variant_result.stderr.strip() or "Variant generation failed") + + try: + return json.loads(variant_result.stdout) + except json.JSONDecodeError as e: + raise ValueError(f"Parse error: {e}") from e + + # --------------------------------------------------------------------------- # HTML report generation # --------------------------------------------------------------------------- @@ -204,9 +267,9 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: sc = it["score"] train_score = sc.get("train") test_score = sc.get("test") - score_str = f'{train_score:.2f}' if isinstance(train_score, (int, float)) else "?" + score_str = f"{train_score:.2f}" if isinstance(train_score, (int, float)) else "?" 
if isinstance(test_score, (int, float)): - score_str += f' / {test_score:.2f}' + score_str += f" / {test_score:.2f}" delta = str(it.get("delta", "")) dcls = "d-pos" if delta.startswith("+") and delta != "+0" else "d-neg" if delta.startswith("-") else "d-zero" summary = html_mod.escape(str(it.get("change_summary", ""))[:80]) @@ -227,10 +290,12 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
{diff_esc}
""" - chart_json = json.dumps([ - {"x": it["number"], "train": it["score"].get("train", 0), "test": it["score"].get("test")} - for it in iterations - ]) + chart_json = json.dumps( + [ + {"x": it["number"], "train": it["score"].get("train", 0), "test": it["score"].get("test")} + for it in iterations + ] + ) diffs_json = json.dumps({it["number"]: str(it.get("diff", "")) for it in iterations}) bt = baseline.get("train", 0.0) @@ -489,7 +554,7 @@ def _parse_frontmatter(content: str) -> tuple[bool, str]: while idx < len(fm_lines): line = fm_lines[idx] if line.startswith("description:"): - value = line[len("description:"):].strip() + value = line[len("description:") :].strip() if value in (">", "|", ">-", "|-"): parts: list[str] = [] idx += 1 @@ -509,9 +574,7 @@ def _is_trigger_task(task: dict) -> bool: def _is_pattern_task(task: dict) -> bool: - return "prompt" in task and ( - "expected_patterns" in task or "forbidden_patterns" in task or "weight" in task - ) + return "prompt" in task and ("expected_patterns" in task or "forbidden_patterns" in task or "weight" in task) def _validate_task_set(tasks: list[dict]) -> None: @@ -534,9 +597,7 @@ def _validate_task_set(tasks: list[dict]) -> None: "Use trigger-rate tasks with 'query' and 'should_trigger' fields." ) - raise ValueError( - "Unsupported task format. Expected trigger-rate tasks with 'query' and 'should_trigger' fields." - ) + raise ValueError("Unsupported task format. 
Expected trigger-rate tasks with 'query' and 'should_trigger' fields.") # --------------------------------------------------------------------------- @@ -577,13 +638,21 @@ def _run_trigger_rate( break cmd = [ - sys.executable, "-m", "scripts.skill_eval.run_eval", - "--eval-set", task_file, - "--skill-path", skill_dir, - "--description", description, - "--num-workers", str(num_workers), - "--timeout", str(timeout), - "--runs-per-query", "1", + sys.executable, + "-m", + "scripts.skill_eval.run_eval", + "--eval-set", + task_file, + "--skill-path", + skill_dir, + "--description", + description, + "--num-workers", + str(num_workers), + "--timeout", + str(timeout), + "--runs-per-query", + "1", ] if verbose: cmd.append("--verbose") @@ -592,8 +661,12 @@ def _run_trigger_rate( env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} result = subprocess.run( - cmd, capture_output=True, text=True, - cwd=str(project_root), env=env, timeout=600, + cmd, + capture_output=True, + text=True, + cwd=str(project_root), + env=env, + timeout=600, ) if result.returncode != 0: @@ -654,6 +727,7 @@ def assess_target( # different variants produce different scores. 
if dry_run: import hashlib + h = int(hashlib.sha256(content.encode()).hexdigest()[:8], 16) base = (h % 30 + 70) / 100.0 # 0.70-1.00 range — always decent scores["correctness"] = round(base * 10, 2) @@ -664,10 +738,14 @@ def assess_target( scores["tests_pass"] = True # always pass in dry-run for task in tasks: name = task.get("name", task.get("query", "unnamed"))[:40] - scores["task_results"].append({ - "name": name, "passed": True, - "score": base, "details": "dry-run", - }) + scores["task_results"].append( + { + "name": name, + "passed": True, + "score": base, + "details": "dry-run", + } + ) return scores # Detect assessment mode from task format @@ -690,12 +768,14 @@ def assess_target( scores["tests_pass"] = passed == total for r in results.get("results", []): - scores["task_results"].append({ - "name": r.get("query", "unnamed")[:40], - "passed": r.get("pass", False), - "score": 1.0 if r.get("pass", False) else 0.0, - "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}", - }) + scores["task_results"].append( + { + "name": r.get("query", "unnamed")[:40], + "passed": r.get("pass", False), + "score": 1.0 if r.get("pass", False) else 0.0, + "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}", + } + ) return scores # Benchmark behavioral assessment — not yet implemented. 
@@ -739,13 +819,26 @@ def run_optimization_loop( max_iterations: int = 20, min_gain: float = 0.02, train_split: float = 0.6, - model: str = "claude-sonnet-4-20250514", + revert_streak_limit: int = 5, + beam_width: int = 1, + candidates_per_parent: int = 1, + holdout_check_cadence: int = 5, + model: str | None = None, verbose: bool = False, report_path: Path | None = None, output_dir: Path | None = None, dry_run: bool = False, ) -> dict: """Run the autoresearch optimization loop.""" + if beam_width < 1: + raise ValueError("beam_width must be >= 1") + if candidates_per_parent < 1: + raise ValueError("candidates_per_parent must be >= 1") + if revert_streak_limit < 1: + raise ValueError("revert_streak_limit must be >= 1") + if holdout_check_cadence < 0: + raise ValueError("holdout_check_cadence must be >= 0") + if output_dir is None: output_dir = Path("evals/iterations") output_dir.mkdir(parents=True, exist_ok=True) @@ -764,7 +857,6 @@ def run_optimization_loop( "Target must have YAML frontmatter with a non-empty description. " "optimize_loop.py currently supports frontmatter-description optimization only." 
) - current_content = original_content target_label = target_path.name if verbose: @@ -773,7 +865,7 @@ def run_optimization_loop( baseline_scores = assess_target(target_path, train_tasks, goal, verbose, dry_run) baseline_composite = composite_score(baseline_scores) best_score = baseline_composite - best_content = current_content + best_content = original_content best_iteration = 0 baseline_holdout_scores = assess_target(target_path, test_tasks, goal, verbose, dry_run) if test_tasks else None @@ -783,237 +875,315 @@ def run_optimization_loop( holdout_display = f"{baseline_holdout:.4f}" if baseline_holdout is not None else "n/a" print(f"Baseline: train={baseline_composite:.4f}, holdout={holdout_display}", file=sys.stderr) + baseline_failures = [t for t in baseline_scores.get("task_results", []) if not t.get("passed")] + frontier = [ + { + "content": original_content, + "score": baseline_composite, + "iteration": 0, + "failures": baseline_failures, + "history": [], + } + ] + iterations: list[dict] = [] - consecutive_reverts = 0 + rounds_without_keep = 0 exit_reason = "unknown" status = "RUNNING" total_tokens = 0 + iteration_counter = 0 - for i in range(1, max_iterations + 1): + for round_number in range(1, max_iterations + 1): if verbose: - print(f"\n{'=' * 60}\nIteration {i}/{max_iterations} (best={best_score:.4f})", file=sys.stderr) - - # 1. 
Generate variant - t0 = time.time() - last_failures = [] - if iterations: - last_scores_data = iterations[-1].get("scores", {}) - last_failures = [t for t in last_scores_data.get("task_results", []) if not t.get("passed")] - history = [ - { - "number": item["number"], - "verdict": item["verdict"], - "change_summary": item["change_summary"], - "delta": item["delta"], - } - for item in iterations[-5:] - ] - - if dry_run: - variant_content, change_summary, reasoning = make_dry_run_variant(current_content, i) - variant_output = { - "variant": variant_content, - "summary": change_summary, - "reasoning": reasoning, - "tokens_used": 0, - "deletions": [], - "deletion_justification": "", - } - deletions = [] - deletion_justification = "" - else: - with tempfile.NamedTemporaryFile(mode="w", suffix=target_path.suffix, encoding="utf-8") as current_file: - current_file.write(current_content) - current_file.flush() - variant_result = subprocess.run( - [ - sys.executable, - str(Path(__file__).parent / "generate_variant.py"), - "--target", str(target_path), - "--goal", goal, - "--current-content-file", current_file.name, - "--failures", json.dumps(last_failures), - "--history", json.dumps(history), - "--model", model, - ], - capture_output=True, text=True, timeout=120, - ) - - if variant_result.returncode != 0: - if verbose: - print(f"Variant generation failed: {variant_result.stderr}", file=sys.stderr) - consecutive_reverts += 1 - iterations.append({ - "number": i, "verdict": "REVERT", - "score": {"train": best_score}, - "delta": "0", "change_summary": "Variant generation failed", - "reasoning": variant_result.stderr[:200], "diff": "", - }) - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" - status = "CONVERGED" - break - continue - - try: - variant_output = json.loads(variant_result.stdout) - variant_content = variant_output["variant"] - change_summary = variant_output.get("summary", "") - reasoning = 
variant_output.get("reasoning", "") - total_tokens += variant_output.get("tokens_used", 0) - deletions = variant_output.get("deletions", []) - deletion_justification = variant_output.get("deletion_justification", "").strip() - except (json.JSONDecodeError, KeyError) as e: - if verbose: - print(f"Parse error: {e}", file=sys.stderr) - consecutive_reverts += 1 - iterations.append({ - "number": i, "verdict": "REVERT", - "score": {"train": best_score}, - "delta": "0", "change_summary": f"Parse error: {e}", - "reasoning": "", "diff": "", - }) - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" - status = "CONVERGED" - break - continue - - gen_elapsed = time.time() - t0 - - # 2. Validate protected sections - if not check_protected_sections(original_content, variant_content): - if verbose: - print("REJECTED: Protected sections modified", file=sys.stderr) - diff_text = generate_diff(current_content, variant_content, target_label) - save_iteration(output_dir, i, variant_content, {"protected_intact": False}, - "REVERT", "Protected sections modified", diff_text, change_summary) - iterations.append({ - "number": i, "verdict": "REVERT", - "score": {"train": 0.0}, - "delta": "0", "change_summary": "Protected sections modified", - "reasoning": reasoning, "diff": diff_text, - }) - consecutive_reverts += 1 - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" - status = "CONVERGED" - break - continue - - if deletions and not deletion_justification: - if verbose: - print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr) - diff_text = generate_diff(current_content, variant_content, target_label) - save_iteration( - output_dir, - i, - variant_content, - {"protected_intact": True}, - "REVERT", - "Deleted sections without justification", - diff_text, - change_summary, - deletions=deletions, + print( + f"\n{'=' * 60}\nRound {round_number}/{max_iterations} " + 
f"(frontier={len(frontier)}, best={best_score:.4f})", + file=sys.stderr, ) - iterations.append({ - "number": i, - "verdict": "REVERT", - "score": {"train": best_score}, - "delta": "0", - "change_summary": "Deleted sections without justification", - "reasoning": reasoning, - "diff": diff_text, - "deletions": deletions, - "deletion_justification": "", - }) - consecutive_reverts += 1 - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" - status = "CONVERGED" - break - continue - - # 3. Evaluate variant - temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}" - temp_target.write_text(variant_content) - try: - t0 = time.time() - variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run) - eval_elapsed = time.time() - t0 - variant_composite = composite_score(variant_scores) - finally: - temp_target.unlink(missing_ok=True) - diff_text = generate_diff(current_content, variant_content, target_label) + kept_nodes: list[dict] = [] + iteration_by_number: dict[int, dict] = {} + + for parent_index, parent in enumerate(frontier, start=1): + last_failures = parent.get("failures", []) + history = parent.get("history", [])[-5:] + + for candidate_index in range(1, candidates_per_parent + 1): + iteration_counter += 1 + t0 = time.time() + diversification_note = None + if beam_width > 1 or candidates_per_parent > 1 or len(frontier) > 1: + diversification_note = ( + f"Round {round_number}; parent {parent_index}/{len(frontier)} " + f"(source iteration {parent.get('iteration', 0)}); " + f"candidate {candidate_index}/{candidates_per_parent}. " + "Produce a materially different approach from sibling candidates in this round." 
+ ) + + base_metadata = { + "round": round_number, + "parent_iteration": parent.get("iteration", 0), + "parent_rank": parent_index, + "candidate_index": candidate_index, + "frontier_rank": None, + "selected_for_frontier": False, + } + + try: + variant_output = _generate_variant_output( + current_content=parent["content"], + target_path=target_path, + goal=goal, + last_failures=last_failures, + history=history, + model=model, + dry_run=dry_run, + iteration_number=iteration_counter, + diversification_note=diversification_note, + ) + variant_content = variant_output["variant"] + change_summary = variant_output.get("summary", "") + reasoning = variant_output.get("reasoning", "") + total_tokens += variant_output.get("tokens_used", 0) + deletions = variant_output.get("deletions", []) + deletion_justification = variant_output.get("deletion_justification", "").strip() + except (RuntimeError, ValueError, KeyError) as e: + if verbose: + print(f"Variant generation failed: {e}", file=sys.stderr) + iteration_data = { + "number": iteration_counter, + "verdict": "REVERT", + "score": {"train": parent["score"], "test": None}, + "delta": "0", + "change_summary": str(e), + "reasoning": "", + "diff": "", + "tokens_used": 0, + "scores": {}, + **base_metadata, + } + save_iteration( + output_dir, + iteration_counter, + parent["content"], + {}, + "REVERT", + "", + "", + str(e), + metadata=base_metadata, + ) + iterations.append(iteration_data) + iteration_by_number[iteration_counter] = iteration_data + continue + + gen_elapsed = time.time() - t0 + diff_text = generate_diff(parent["content"], variant_content, target_label) + + if not check_protected_sections(original_content, variant_content): + if verbose: + print("REJECTED: Protected sections modified", file=sys.stderr) + iteration_data = { + "number": iteration_counter, + "verdict": "REVERT", + "score": {"train": 0.0, "test": None}, + "delta": "0", + "change_summary": "Protected sections modified", + "reasoning": reasoning, + "diff": 
diff_text, + "tokens_used": variant_output.get("tokens_used", 0), + "scores": {"protected_intact": False}, + **base_metadata, + } + save_iteration( + output_dir, + iteration_counter, + variant_content, + {"protected_intact": False}, + "REVERT", + "Protected sections modified", + diff_text, + change_summary, + metadata=base_metadata, + ) + iterations.append(iteration_data) + iteration_by_number[iteration_counter] = iteration_data + continue + + if deletions and not deletion_justification: + if verbose: + print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr) + iteration_data = { + "number": iteration_counter, + "verdict": "REVERT", + "score": {"train": parent["score"], "test": None}, + "delta": "0", + "change_summary": "Deleted sections without justification", + "reasoning": reasoning, + "diff": diff_text, + "tokens_used": variant_output.get("tokens_used", 0), + "scores": {"protected_intact": True}, + "deletions": deletions, + "deletion_justification": "", + **base_metadata, + } + save_iteration( + output_dir, + iteration_counter, + variant_content, + {"protected_intact": True}, + "REVERT", + "Deleted sections without justification", + diff_text, + change_summary, + deletions=deletions, + metadata=base_metadata, + ) + iterations.append(iteration_data) + iteration_by_number[iteration_counter] = iteration_data + continue + + temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}" + temp_target.write_text(variant_content) + try: + t0 = time.time() + variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run) + eval_elapsed = time.time() - t0 + variant_composite = composite_score(variant_scores) + finally: + temp_target.unlink(missing_ok=True) + + gain = variant_composite - parent["score"] + if verbose: + print( + f"Candidate {iteration_counter}: score={variant_composite:.4f} " + f"(vs parent {parent['score']:.4f}, gain={gain:+.4f}, " + f"gen={gen_elapsed:.1f}s, 
eval={eval_elapsed:.1f}s)", + file=sys.stderr, + ) + + verdict = "KEEP" if gain > min_gain else "REVERT" + if deletions and deletion_justification: + change_summary = f"{change_summary} [deletion justified]" + delta_str = f"{gain:+.2f}" if gain != 0 else "0" + + metadata = { + **base_metadata, + "gain_against_parent": round(gain, 4), + } + + save_iteration( + output_dir, + iteration_counter, + variant_content, + variant_scores, + verdict, + reasoning, + diff_text, + change_summary, + deletions=deletions, + deletion_justification=deletion_justification, + metadata=metadata, + ) - if verbose: - print(f"Score: {variant_composite:.4f} (gain={variant_composite - best_score:.4f}, gen={gen_elapsed:.1f}s, eval={eval_elapsed:.1f}s)", file=sys.stderr) - - # 4. Keep/revert (deterministic arithmetic) - gain = variant_composite - best_score - if gain > min_gain: - verdict = "KEEP" - best_score = variant_composite - best_content = variant_content - best_iteration = i - current_content = variant_content - consecutive_reverts = 0 - delta_str = f"+{gain:.2f}" + iteration_data = { + "number": iteration_counter, + "verdict": verdict, + "score": {"train": variant_composite, "test": None}, + "delta": delta_str, + "change_summary": change_summary, + "reasoning": reasoning, + "diff": diff_text, + "tokens_used": variant_output.get("tokens_used", 0), + "scores": variant_scores, + "deletions": deletions, + "deletion_justification": deletion_justification, + **metadata, + } + iterations.append(iteration_data) + iteration_by_number[iteration_counter] = iteration_data + + if verdict == "KEEP": + if variant_composite > best_score: + best_score = variant_composite + best_content = variant_content + best_iteration = iteration_counter + + kept_nodes.append( + { + "content": variant_content, + "score": variant_composite, + "iteration": iteration_counter, + "failures": [t for t in variant_scores.get("task_results", []) if not t.get("passed")], + "history": parent.get("history", []) + + [ + { + 
"number": iteration_counter, + "verdict": verdict, + "change_summary": change_summary, + "delta": delta_str, + } + ], + } + ) + + if kept_nodes: + kept_nodes.sort(key=lambda item: (-item["score"], item["iteration"])) + frontier = kept_nodes[:beam_width] + for rank, node in enumerate(frontier, start=1): + item = iteration_by_number.get(node["iteration"]) + if item is not None: + item["selected_for_frontier"] = True + item["frontier_rank"] = rank + rounds_without_keep = 0 else: - verdict = "REVERT" - consecutive_reverts += 1 - delta_str = f"{gain:+.2f}" if gain != 0 else "0" - - if deletions and deletion_justification: - change_summary = f"{change_summary} [deletion justified]" - - save_iteration(output_dir, i, variant_content, variant_scores, - verdict, reasoning, diff_text, change_summary, - deletions=deletions, deletion_justification=deletion_justification) - - iteration_data: dict = { - "number": i, "verdict": verdict, - "score": {"train": variant_composite, "test": None}, - "delta": delta_str, "change_summary": change_summary, - "reasoning": reasoning, "diff": diff_text, - "tokens_used": variant_output.get("tokens_used", 0), - "scores": variant_scores, - "deletions": deletions, - "deletion_justification": deletion_justification, - } + rounds_without_keep += 1 - # 5. 
Goodhart alarm — every 5 iterations, check held-out set - if test_tasks and i % 5 == 0: + if test_tasks and holdout_check_cadence > 0 and round_number % holdout_check_cadence == 0: + temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}" try: temp_target.write_text(best_content) holdout_scores = assess_target(temp_target, test_tasks, goal, verbose, dry_run) holdout_composite = composite_score(holdout_scores) - iteration_data["score"]["test"] = holdout_composite + if iterations: + iterations[-1]["score"]["test"] = holdout_composite finally: temp_target.unlink(missing_ok=True) if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite): if verbose: - print(f"GOODHART ALARM: holdout={holdout_composite:.4f} vs baseline={baseline_holdout:.4f}", file=sys.stderr) - exit_reason = f"goodhart_alarm (iteration {i})" + print( + f"GOODHART ALARM: holdout={holdout_composite:.4f} vs baseline={baseline_holdout:.4f}", + file=sys.stderr, + ) + exit_reason = f"goodhart_alarm (round {round_number})" status = "GOODHART_ALARM" - iterations.append(iteration_data) break - iterations.append(iteration_data) - - # 6. 
Convergence check - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" + if rounds_without_keep >= revert_streak_limit: + exit_reason = f"converged ({revert_streak_limit} rounds without KEEP by round {round_number})" status = "CONVERGED" break - # Regenerate live report if report_path: - rd = _build_report_data(target_label, goal, baseline_composite, baseline_holdout, - len(train_tasks), len(test_tasks), iterations, max_iterations, - status, total_tokens) + rd = _build_report_data( + target_label, + goal, + baseline_composite, + baseline_holdout, + len(train_tasks), + len(test_tasks), + iterations, + max_iterations, + status, + total_tokens, + ) + rd["search"] = { + "strategy": "beam", + "beam_width": beam_width, + "candidates_per_parent": candidates_per_parent, + "holdout_check_cadence": holdout_check_cadence, + } report_path.write_text(generate_optimization_report(rd, auto_refresh=True)) else: @@ -1022,9 +1192,24 @@ def run_optimization_loop( # Final report if report_path: - rd = _build_report_data(target_label, goal, baseline_composite, baseline_holdout, - len(train_tasks), len(test_tasks), iterations, max_iterations, - status, total_tokens) + rd = _build_report_data( + target_label, + goal, + baseline_composite, + baseline_holdout, + len(train_tasks), + len(test_tasks), + iterations, + max_iterations, + status, + total_tokens, + ) + rd["search"] = { + "strategy": "beam", + "beam_width": beam_width, + "candidates_per_parent": candidates_per_parent, + "holdout_check_cadence": holdout_check_cadence, + } report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) if best_iteration > 0: @@ -1034,17 +1219,25 @@ def run_optimization_loop( print(f"\nBest variant saved to: {best_path}", file=sys.stderr) result = { - "exit_reason": exit_reason, "status": status, - "target": str(target_path), "goal": goal, + "exit_reason": exit_reason, + "status": status, + "target": str(target_path), + "goal": goal, 
"baseline_score": {"train": baseline_composite, "test": baseline_holdout}, "baseline_train_score": baseline_composite, "baseline_holdout_score": baseline_holdout, "best_score": best_score, - "best_iteration": best_iteration, "iterations_run": len(iterations), + "best_iteration": best_iteration, + "iterations_run": len(iterations), "max_iterations": max_iterations, "improvements_found": sum(1 for it in iterations if it["verdict"] == "KEEP"), "total_tokens": total_tokens, - "train_size": len(train_tasks), "test_size": len(test_tasks), + "search_strategy": "beam", + "beam_width": beam_width, + "candidates_per_parent": candidates_per_parent, + "holdout_check_cadence": holdout_check_cadence, + "train_size": len(train_tasks), + "test_size": len(test_tasks), "iterations": iterations, } (output_dir / "results.json").write_text(json.dumps(result, indent=2)) @@ -1061,12 +1254,38 @@ def main(): parser.add_argument("--target", required=True, help="Path to agent/skill file to optimize") parser.add_argument("--goal", required=True, help="Optimization objective") parser.add_argument("--benchmark-tasks", required=True, help="Path to benchmark tasks JSON") - parser.add_argument("--max-iterations", type=int, default=20, help="Max iterations (default: 20)") + parser.add_argument( + "--max-iterations", + type=int, + default=20, + help="Max optimization rounds (default: 20); each round evaluates up to beam_width x candidates_per_parent candidates", + ) parser.add_argument("--min-gain", type=float, default=0.02, help="Min score gain to keep (default: 0.02)") parser.add_argument("--train-split", type=float, default=0.6, help="Train fraction (default: 0.6)") - parser.add_argument("--model", required=True, help="Model for variant generation") + parser.add_argument( + "--revert-streak-limit", + type=int, + default=5, + help="Stop after this many rounds without any KEEP candidates (default: 5)", + ) + parser.add_argument("--beam-width", type=int, default=1, help="Number of kept candidates to 
retain per round") + parser.add_argument( + "--candidates-per-parent", + type=int, + default=1, + help="How many sibling variants to generate from each frontier candidate per round", + ) + parser.add_argument( + "--holdout-check-cadence", + type=int, + default=5, + help="Check held-out tasks every N rounds (default: 5; 0 disables)", + ) + parser.add_argument("--model", default=None, help="Optional Claude Code model override for variant generation") parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") - parser.add_argument("--dry-run", action="store_true", help="Use synthetic scores (test loop mechanics without API)") + parser.add_argument( + "--dry-run", action="store_true", help="Use synthetic scores (test loop mechanics without calling Claude Code)" + ) parser.add_argument("--report", default=None, help="Path for live HTML report") parser.add_argument("--output-dir", default=None, help="Directory for iteration snapshots") args = parser.parse_args() @@ -1083,10 +1302,17 @@ def main(): try: result = run_optimization_loop( - target_path=target, goal=args.goal, + target_path=target, + goal=args.goal, benchmark_tasks_path=tasks_path, - max_iterations=args.max_iterations, min_gain=args.min_gain, - train_split=args.train_split, model=args.model, + max_iterations=args.max_iterations, + min_gain=args.min_gain, + train_split=args.train_split, + revert_streak_limit=args.revert_streak_limit, + beam_width=args.beam_width, + candidates_per_parent=args.candidates_per_parent, + holdout_check_cadence=args.holdout_check_cadence, + model=args.model, verbose=args.verbose, report_path=Path(args.report) if args.report else None, output_dir=Path(args.output_dir) if args.output_dir else None, diff --git a/skills/skill-creator/assets/eval_viewer.html b/skills/skill-creator/assets/eval_viewer.html index 0c83595..81243e0 100644 --- a/skills/skill-creator/assets/eval_viewer.html +++ b/skills/skill-creator/assets/eval_viewer.html @@ -1517,6 +1517,7 @@

Blind A/B Code Review

var actions = el('div','opt-actions'); var previewBtn = el('button','opt-btn opt-btn-primary','Preview Selected Snapshot'); + if(!keepNumbers.length) previewBtn.disabled = true; previewBtn.addEventListener('click', function(){ var selected = document.querySelector('#iterations-page .snapshot-pick:checked'); if(!selected){alert('Select one kept iteration');return;} @@ -1530,6 +1531,7 @@

Blind A/B Code Review

actions.appendChild(previewBtn); var exportBtn = el('button','opt-btn','Export Selected Snapshot'); + if(!keepNumbers.length) exportBtn.disabled = true; exportBtn.addEventListener('click', function(){ var selected = document.querySelector('#iterations-page .snapshot-pick:checked'); if(!selected){alert('Select one kept iteration');return;} @@ -1561,6 +1563,7 @@

Blind A/B Code Review

page.appendChild(previewArea); var snapshotNote = el('p','muted','Selection exports the full diff from the original target to one kept snapshot.'); + if(!keepNumbers.length) snapshotNote.textContent = 'No kept snapshot yet. Preview and export unlock after the first KEEP verdict.'; snapshotNote.style.marginTop = '12px'; page.appendChild(snapshotNote); @@ -1604,6 +1607,9 @@

Blind A/B Code Review

buildStat(goalBody, 'Target', opt.target||'?'); buildStat(goalBody, 'Goal', opt.goal); buildStat(goalBody, 'Train/Test Split', String(taskCounts.train||opt.train_size||'?')+'/'+String(taskCounts.test||opt.test_size||'?')); + if(opt.beam_width || opt.candidates_per_parent) { + buildStat(goalBody, 'Search', 'beam=' + String(opt.beam_width||1) + ', siblings=' + String(opt.candidates_per_parent||1)); + } goalCard.appendChild(goalBody); page.appendChild(goalCard); } diff --git a/skills/skill-eval/SKILL.md b/skills/skill-eval/SKILL.md index 8e5214f..ba89f3b 100644 --- a/skills/skill-eval/SKILL.md +++ b/skills/skill-eval/SKILL.md @@ -109,7 +109,6 @@ Automated loop that tests, improves, and re-tests descriptions using Claude with python3 -m scripts.skill_eval.run_loop \ --eval-set evals.json \ --skill-path \ - --model claude-opus-4-6 \ --max-iterations 5 \ --verbose ``` @@ -117,7 +116,7 @@ python3 -m scripts.skill_eval.run_loop \ This will: 1. Split eval set 60/40 train/test (stratified by should_trigger) — prevents overfitting to test cases 2. Evaluate current description on all queries (3 runs each for reliability) -3. Use Claude with extended thinking to propose improvements based on training failures +3. Use `claude -p` to propose improvements based on training failures 4. Re-evaluate the new description 5. Repeat until all pass or max iterations reached 6. Select best description by **test** score (not train score — prevents overfitting) @@ -207,9 +206,9 @@ If description optimization found a better description: **Cause**: Claude CLI not available for trigger evaluation **Solution**: Install Claude Code CLI. Trigger eval requires `claude -p` to test skill invocation. -### Error: "anthropic SDK not installed" -**Cause**: Description optimization requires the Anthropic Python SDK -**Solution**: `pip install anthropic`. Only needed for `improve_description.py` and `run_loop.py`. 
+### Error: "legacy SDK dependency" +**Cause**: Outdated instructions or an old checkout still expects a direct SDK client +**Solution**: Update to the current scripts. Description optimization now runs through `claude -p`. ### Error: "CLAUDECODE environment variable" **Cause**: Running eval from inside a Claude Code session blocks nested instances From 5490a0819d640f46fcbddea58a3b9d73942d399d Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 08:18:50 -0700 Subject: [PATCH 03/12] fix(review-round-1): address 8 findings from PR review Critical fixes: - Temp file collision in beam search: embed iteration_counter in filename - rules-distill.py: log errors on claude -p failure and JSONDecodeError - _run_trigger_rate: always print subprocess errors, not just under --verbose - _generate_variant_output: add cwd and env (strip CLAUDECODE) Important fixes: - _find_project_root: warn on silent cwd fallback in generate_variant and improve_description - improve_description: warn when tags not found - search_strategy: emit "hill_climb" for single-path runs (beam_width=1, candidates=1) - rules-distill: log exception in broad except clause --- scripts/rules-distill.py | 5 ++- scripts/skill_eval/improve_description.py | 3 ++ .../test_agent_comparison_optimize_loop.py | 37 +++++-------------- .../scripts/generate_variant.py | 8 ++-- .../agent-comparison/scripts/optimize_loop.py | 23 ++++++++---- 5 files changed, 36 insertions(+), 40 deletions(-) diff --git a/scripts/rules-distill.py b/scripts/rules-distill.py index db1b640..2e1eac6 100644 --- a/scripts/rules-distill.py +++ b/scripts/rules-distill.py @@ -325,11 +325,13 @@ def _run_claude_code(prompt: str, model: str | None = None) -> tuple[str, str]: timeout=300, ) if result.returncode != 0: + print(f"claude -p failed (exit {result.returncode}): {result.stderr}", file=sys.stderr) return "", "" try: events = json.loads(result.stdout) except json.JSONDecodeError: + print(f"claude -p returned invalid JSON: 
{result.stdout[:200]}", file=sys.stderr) return "", "" assistant_text = "" @@ -381,7 +383,8 @@ def _llm_extract_principles(skill_content: str, skill_name: str) -> list[dict] | for p in principles if isinstance(p, str) and len(p) >= 15 ] - except Exception: + except Exception as exc: + print(f"LLM extraction failed: {exc}", file=sys.stderr) return None diff --git a/scripts/skill_eval/improve_description.py b/scripts/skill_eval/improve_description.py index c759219..9b44ae2 100644 --- a/scripts/skill_eval/improve_description.py +++ b/scripts/skill_eval/improve_description.py @@ -21,6 +21,7 @@ def _find_project_root() -> Path: for parent in [current, *current.parents]: if (parent / ".claude").is_dir(): return parent + print("Warning: .claude/ directory not found, using cwd as project root", file=sys.stderr) return current @@ -159,6 +160,8 @@ def improve_description( # Parse out the tags match = re.search(r"(.*?)", text, re.DOTALL) + if not match: + print("Warning: tags not found in response, using raw output", file=sys.stderr) description = match.group(1).strip().strip('"') if match else text.strip().strip('"') # Log the transcript diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index d3c8bdc..662b63f 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -1,9 +1,8 @@ import importlib.util import json -from pathlib import Path import subprocess import sys - +from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[2] @@ -40,13 +39,7 @@ def test_check_protected_sections_rejects_missing_blocks(): "agent_comparison_optimize_loop", "skills/agent-comparison/scripts/optimize_loop.py", ) - original = ( - "alpha\n" - "\n" - "keep me\n" - "\n" - "omega\n" - ) + original = "alpha\n\nkeep me\n\nomega\n" relocated = "alpha\nomega\n" assert optimize_loop.check_protected_sections(original, relocated) is False @@ -57,13 +50,7 
@@ def test_restore_protected_does_not_silently_reinsert_missing_blocks(): "agent_comparison_generate_variant", "skills/agent-comparison/scripts/generate_variant.py", ) - original = ( - "alpha\n" - "\n" - "keep me\n" - "\n" - "omega\n" - ) + original = "alpha\n\nkeep me\n\nomega\n" variant = "alpha\nomega\n" restored = generate_variant.restore_protected(original, variant) @@ -135,9 +122,7 @@ def test_optimize_loop_omits_model_flag_when_not_provided(tmp_path, monkeypatch) ) target = tmp_path / "SKILL.md" - target.write_text( - "---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n" - ) + target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n") tasks = [ {"name": "train-positive", "query": "write go tests", "should_trigger": True, "split": "train"}, {"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "split": "test"}, @@ -156,7 +141,7 @@ def fake_assess_target(*args, **kwargs): "task_results": [{"name": "train-positive", "passed": False}], } - def fake_run(cmd, capture_output, text, timeout): + def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): seen_cmds.append(cmd) payload = { "variant": target.read_text(), @@ -195,9 +180,7 @@ def test_optimize_loop_respects_revert_streak_limit(tmp_path, monkeypatch): ) target = tmp_path / "SKILL.md" - target.write_text( - "---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n" - ) + target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n") tasks_file = tmp_path / "tasks.json" tasks_file.write_text( json.dumps( @@ -219,7 +202,7 @@ def fake_assess_target(*args, **kwargs): "task_results": [{"name": "train-positive", "passed": False}], } - def fake_run(cmd, capture_output, text, timeout): + def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): payload = { "variant": target.read_text(), "summary": "no-op", @@ 
-258,9 +241,7 @@ def test_optimize_loop_beam_search_retains_top_k_candidates(tmp_path, monkeypatc ) target = tmp_path / "SKILL.md" - target.write_text( - "---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n" - ) + target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n") tasks_file = tmp_path / "tasks.json" tasks_file.write_text( json.dumps( @@ -275,7 +256,7 @@ def test_optimize_loop_beam_search_retains_top_k_candidates(tmp_path, monkeypatc generated = iter(["alpha", "beta"]) - def fake_run(cmd, capture_output, text, timeout): + def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): label = next(generated) payload = { "variant": target.read_text() + f"\n\n", diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py index bccf512..31cb244 100644 --- a/skills/agent-comparison/scripts/generate_variant.py +++ b/skills/agent-comparison/scripts/generate_variant.py @@ -59,8 +59,7 @@ def restore_protected(original: str, variant: str) -> str: if len(orig_sections) != len(var_sections): print( - "Warning: Protected section count mismatch " - f"(original={len(orig_sections)}, variant={len(var_sections)}).", + f"Warning: Protected section count mismatch (original={len(orig_sections)}, variant={len(var_sections)}).", file=sys.stderr, ) return variant @@ -97,6 +96,7 @@ def _find_project_root() -> Path: for parent in [current, *current.parents]: if (parent / ".claude").is_dir(): return parent + print("Warning: .claude/ directory not found, using cwd as project root", file=sys.stderr) return current @@ -168,7 +168,9 @@ def generate_variant( if history: history_section = "\n\nPrevious attempts (do NOT repeat — try structurally different approaches):\n" for h in history: - history_section += f" Iteration {h.get('number', '?')}: {h.get('verdict', '?')} — {h.get('change_summary', '')}\n" + history_section += ( + f" Iteration 
{h.get('number', '?')}: {h.get('verdict', '?')} — {h.get('change_summary', '')}\n" + ) diversification_section = "" if diversification_note: diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index 54eea55..a03ceec 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -21,6 +21,7 @@ import argparse import json +import os import random import re import subprocess @@ -195,10 +196,18 @@ def _generate_variant_output( variant_cmd.extend(["--diversification-note", diversification_note]) if model: variant_cmd.extend(["--model", model]) + _variant_project_root = Path.cwd() + for _parent in [_variant_project_root, *_variant_project_root.parents]: + if (_parent / ".claude").is_dir(): + _variant_project_root = _parent + break + _variant_env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} variant_result = subprocess.run( variant_cmd, capture_output=True, text=True, + cwd=str(_variant_project_root), + env=_variant_env, timeout=360, ) @@ -618,9 +627,6 @@ def _run_trigger_rate( Tasks must have 'query' and 'should_trigger' fields. Returns run_eval-style results dict. 
""" - import os - import tempfile - task_file = None try: with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: @@ -670,8 +676,7 @@ def _run_trigger_rate( ) if result.returncode != 0: - if verbose: - print(f"Trigger assessment failed: {result.stderr[:300]}", file=sys.stderr) + print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr) return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} return json.loads(result.stdout) @@ -1043,7 +1048,9 @@ def run_optimization_loop( iteration_by_number[iteration_counter] = iteration_data continue - temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}" + temp_target = ( + target_path.parent / f".{target_path.stem}_variant_{iteration_counter}{target_path.suffix}" + ) temp_target.write_text(variant_content) try: t0 = time.time() @@ -1140,7 +1147,7 @@ def run_optimization_loop( rounds_without_keep += 1 if test_tasks and holdout_check_cadence > 0 and round_number % holdout_check_cadence == 0: - temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}" + temp_target = target_path.parent / f".{target_path.stem}_holdout_check{target_path.suffix}" try: temp_target.write_text(best_content) holdout_scores = assess_target(temp_target, test_tasks, goal, verbose, dry_run) @@ -1232,7 +1239,7 @@ def run_optimization_loop( "max_iterations": max_iterations, "improvements_found": sum(1 for it in iterations if it["verdict"] == "KEEP"), "total_tokens": total_tokens, - "search_strategy": "beam", + "search_strategy": "beam" if beam_width > 1 or candidates_per_parent > 1 else "hill_climb", "beam_width": beam_width, "candidates_per_parent": candidates_per_parent, "holdout_check_cadence": holdout_check_cadence, From db510bbc9ddb7c700db59dbde3bdf07ad25a86cd Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 08:21:49 -0700 Subject: [PATCH 04/12] fix(review-round-2): handle JSON parse error in 
_run_trigger_rate, fix task-file leak Critical fixes: - Wrap json.loads in _run_trigger_rate with try/except JSONDecodeError (exits-0-but-invalid-JSON no longer crashes the entire optimization run) - Move task_file assignment before json.dump so finally block can always clean up the temp file on disk Also: document _run_claude_code soft-fail contract in rules-distill.py --- scripts/rules-distill.py | 7 ++++++- skills/agent-comparison/scripts/optimize_loop.py | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/rules-distill.py b/scripts/rules-distill.py index 2e1eac6..6a19172 100644 --- a/scripts/rules-distill.py +++ b/scripts/rules-distill.py @@ -310,7 +310,12 @@ def filter_layer4_not_covered( def _run_claude_code(prompt: str, model: str | None = None) -> tuple[str, str]: - """Run Claude Code and return (assistant_text, raw_result_text).""" + """Run Claude Code and return (assistant_text, raw_result_text). + + Soft-fail contract: returns ('', '') on any failure (non-zero exit, invalid + JSON, timeout). Callers must treat empty strings as a no-op and fall back + to keyword-based extraction. 
+ """ cmd = ["claude", "-p", prompt, "--output-format", "json", "--print"] if model: cmd.extend(["--model", model]) diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index a03ceec..56bc45b 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -630,8 +630,8 @@ def _run_trigger_rate( task_file = None try: with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(tasks, f) task_file = f.name + json.dump(tasks, f) with tempfile.TemporaryDirectory() as skill_dir: skill_md = Path(skill_dir) / "SKILL.md" @@ -679,7 +679,11 @@ def _run_trigger_rate( print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr) return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} - return json.loads(result.stdout) + try: + return json.loads(result.stdout) + except json.JSONDecodeError as e: + print(f"Trigger assessment returned invalid JSON: {e} — stdout: {result.stdout[:200]}", file=sys.stderr) + return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} finally: if task_file: Path(task_file).unlink(missing_ok=True) From bb60b7df8b3d6f6f82b541dadc12129f0da07579 Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 08:29:26 -0700 Subject: [PATCH 05/12] fix(review-round-3): catch TimeoutExpired, move write_text inside cleanup guard - Add subprocess.TimeoutExpired to caught exceptions in variant generation loop (prevents unhandled crash when claude -p hits 360s timeout) - Move temp_target.write_text() inside try/finally block so partial writes are cleaned up on disk-full or permission errors --- skills/agent-comparison/scripts/optimize_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index 56bc45b..a8776cf 100644 --- 
a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -956,7 +956,7 @@ def run_optimization_loop( total_tokens += variant_output.get("tokens_used", 0) deletions = variant_output.get("deletions", []) deletion_justification = variant_output.get("deletion_justification", "").strip() - except (RuntimeError, ValueError, KeyError) as e: + except (RuntimeError, ValueError, KeyError, subprocess.TimeoutExpired) as e: if verbose: print(f"Variant generation failed: {e}", file=sys.stderr) iteration_data = { @@ -1055,8 +1055,8 @@ def run_optimization_loop( temp_target = ( target_path.parent / f".{target_path.stem}_variant_{iteration_counter}{target_path.suffix}" ) - temp_target.write_text(variant_content) try: + temp_target.write_text(variant_content) t0 = time.time() variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run) eval_elapsed = time.time() - t0 From 926bedff1082692c26f6d600aed35669de6fac88 Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 09:00:23 -0700 Subject: [PATCH 06/12] style: fix import sort order and formatting - Fix import block ordering in test_eval_compare_optimization.py (ruff I001) - Fix formatting in test_skill_eval_claude_code.py and eval_compare.py (ruff format) --- scripts/tests/test_eval_compare_optimization.py | 1 - scripts/tests/test_skill_eval_claude_code.py | 11 ++++------- skills/skill-creator/scripts/eval_compare.py | 5 +---- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/scripts/tests/test_eval_compare_optimization.py b/scripts/tests/test_eval_compare_optimization.py index f687c7a..011695c 100644 --- a/scripts/tests/test_eval_compare_optimization.py +++ b/scripts/tests/test_eval_compare_optimization.py @@ -2,7 +2,6 @@ import json from pathlib import Path - REPO_ROOT = Path(__file__).resolve().parents[2] diff --git a/scripts/tests/test_skill_eval_claude_code.py b/scripts/tests/test_skill_eval_claude_code.py index c241105..a0c9e05 100644 
--- a/scripts/tests/test_skill_eval_claude_code.py +++ b/scripts/tests/test_skill_eval_claude_code.py @@ -13,11 +13,7 @@ def test_improve_description_uses_claude_code_and_shortens(monkeypatch, tmp_path def fake_run(cmd, capture_output, text, cwd, env, timeout): calls.append(cmd) if len(calls) == 1: - text_out = ( - "" - + ("a" * 1030) - + "" - ) + text_out = "" + ("a" * 1030) + "" else: text_out = "short and valid" payload = [ @@ -33,7 +29,9 @@ def fake_run(cmd, capture_output, text, cwd, env, timeout): skill_content="# Skill", current_description="old", eval_results={ - "results": [{"query": "improve this skill", "should_trigger": True, "pass": False, "triggers": 0, "runs": 1}], + "results": [ + {"query": "improve this skill", "should_trigger": True, "pass": False, "triggers": 0, "runs": 1} + ], "summary": {"passed": 0, "failed": 1, "total": 1}, }, history=[], @@ -48,4 +46,3 @@ def fake_run(cmd, capture_output, text, cwd, env, timeout): transcript = json.loads((tmp_path / "improve_iter_1.json").read_text()) assert transcript["raw_result_text"] == "raw result" assert transcript["rewrite_raw_result_text"] == "raw result" - diff --git a/skills/skill-creator/scripts/eval_compare.py b/skills/skill-creator/scripts/eval_compare.py index cfae534..9a53e25 100644 --- a/skills/skill-creator/scripts/eval_compare.py +++ b/skills/skill-creator/scripts/eval_compare.py @@ -173,10 +173,7 @@ def is_optimization_data(data: object) -> bool: return False if "target" not in data: return False - return all( - isinstance(item, dict) and "number" in item and "verdict" in item - for item in iterations - ) + return all(isinstance(item, dict) and "number" in item and "verdict" in item for item in iterations) def load_optimization_data(workspace: Path) -> dict | None: From fdf897d30a606978b5c72178c30379c64a73c7a7 Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 09:54:29 -0700 Subject: [PATCH 07/12] feat(adr-132): add behavioral eval mode and creation compliance task set 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add _run_behavioral_eval() to optimize_loop.py that runs `claude -p "/do {query}"` and checks for ADR artifact creation, enabling direct testing of /do's creation protocol compliance. Trigger-rate optimization was proven inapplicable for /do (scored 0.0 across all 32 tasks) because /do is slash-invoked, not description-discovered. Behavioral eval via headless /do is the correct approach — confirmed that `claude -p "/do create..."` works but does NOT produce ADRs, validating the compliance gap. Changes: - Add _run_behavioral_eval() with artifact snapshot/diff detection - Add _is_behavioral_task() for eval_mode detection - Update _validate_task_set() for behavioral task format - Wire behavioral path into assess_target() - Add DO NOT OPTIMIZE markers to /do SKILL.md (Phase 2-5 protected) - Create 32-task benchmark set (16 positive, 16 negative, 60/40 split) --- .../do-creation-compliance-tasks.json | 292 ++++++++++++++++++ .../agent-comparison/scripts/optimize_loop.py | 145 ++++++++- skills/do/SKILL.md | 4 + 3 files changed, 436 insertions(+), 5 deletions(-) create mode 100644 skills/agent-comparison/references/do-creation-compliance-tasks.json diff --git a/skills/agent-comparison/references/do-creation-compliance-tasks.json b/skills/agent-comparison/references/do-creation-compliance-tasks.json new file mode 100644 index 0000000..81c2e05 --- /dev/null +++ b/skills/agent-comparison/references/do-creation-compliance-tasks.json @@ -0,0 +1,292 @@ +{ + "tasks": [ + { + "name": "create-agent-prometheus", + "split": "train", + "complexity": "complex", + "query": "create a new Prometheus alerting agent", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "scaffold-skill-migration", + "split": "train", + "complexity": "complex", + "query": "scaffold a new skill for database migration safety", + "should_trigger": true, + "eval_mode": 
"behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "build-pipeline-security", + "split": "train", + "complexity": "complex", + "query": "build a pipeline for automated security scanning", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "create-hook-sql", + "split": "train", + "complexity": "simple", + "query": "create a PostToolUse hook that detects SQL injection patterns", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "new-feature-webhooks", + "split": "train", + "complexity": "complex", + "query": "new feature: add webhook support for deployment notifications", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "scaffold-perses-plugin", + "split": "train", + "complexity": "complex", + "query": "scaffold a Perses dashboard plugin with CUE schema and React component", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "implicit-create-rails-agent", + "split": "train", + "complexity": "simple", + "query": "I need an agent for Ruby on Rails development", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "add-linting-pipeline", + "split": "train", + "complexity": "simple", + "query": "add a new linting pipeline to the toolkit", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "create-voice-profile", + "split": "train", + "complexity": "complex", + "query": "create a new voice profile from my blog writing samples", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "build-agent-rust", + "split": "train", + "complexity": "simple", + "query": "build a Rust development agent with cargo integration", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + 
}, + { + "name": "create-reviewer-agent", + "split": "test", + "complexity": "complex", + "query": "create a new code review agent focused on accessibility compliance", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "scaffold-etl-pipeline", + "split": "test", + "complexity": "complex", + "query": "scaffold an ETL pipeline skill with data validation phases", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "new-session-hook", + "split": "test", + "complexity": "simple", + "query": "new SessionStart hook that loads team configuration from a YAML file", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "create-monitoring-skill", + "split": "test", + "complexity": "simple", + "query": "create a skill for monitoring Kubernetes pod health across namespaces", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "build-terraform-agent", + "split": "test", + "complexity": "complex", + "query": "build a Terraform infrastructure agent with plan-apply-verify phases", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "implicit-create-java-skill", + "split": "test", + "complexity": "simple", + "query": "we need a Java Spring Boot development skill", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-debug-go-tests", + "split": "train", + "complexity": "complex", + "query": "debug why the Go tests are failing in CI", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-review-pr-security", + "split": "train", + "complexity": "complex", + "query": "review this PR for security issues", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": 
"neg-optimize-db", + "split": "train", + "complexity": "simple", + "query": "optimize the database query performance", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-explain-routing", + "split": "train", + "complexity": "simple", + "query": "explain how the routing system works", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-update-errors", + "split": "train", + "complexity": "simple", + "query": "update the error messages in the auth handler", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-research-rbac", + "split": "train", + "complexity": "complex", + "query": "research best practices for Kubernetes RBAC", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-check-ci", + "split": "train", + "complexity": "simple", + "query": "check the CI status on this branch", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-fix-import", + "split": "train", + "complexity": "simple", + "query": "fix the broken import in agents/INDEX.json", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-refactor-middleware", + "split": "train", + "complexity": "complex", + "query": "refactor the authentication middleware to use context propagation", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-run-tests", + "split": "train", + "complexity": "simple", + "query": "run the Python quality gate on the scripts directory", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-check-coverage", + "split": "test", + "complexity": "simple", + "query": "check test coverage for the voice validation module", + 
"should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-deploy-staging", + "split": "test", + "complexity": "complex", + "query": "deploy the latest version to the staging environment", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-audit-deps", + "split": "test", + "complexity": "simple", + "query": "audit dependencies for known CVEs in the Python packages", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-compare-agents", + "split": "test", + "complexity": "complex", + "query": "compare the golang-general-engineer and golang-compact agents on error handling tasks", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-investigate-memory-leak", + "split": "test", + "complexity": "complex", + "query": "investigate the memory leak in the webhook processing service", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-merge-pr", + "split": "test", + "complexity": "simple", + "query": "merge PR 205 after CI passes", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + } + ] +} diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index a8776cf..7f4605c 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -20,6 +20,7 @@ from __future__ import annotations import argparse +import glob import json import os import random @@ -586,6 +587,10 @@ def _is_pattern_task(task: dict) -> bool: return "prompt" in task and ("expected_patterns" in task or "forbidden_patterns" in task or "weight" in task) +def _is_behavioral_task(task: dict) -> bool: + return "query" in task and "should_trigger" in task and task.get("eval_mode") == "behavioral" 
+ + def _validate_task_set(tasks: list[dict]) -> None: """Reject unsupported or mixed task formats early with a clear error.""" if not tasks: @@ -593,9 +598,22 @@ def _validate_task_set(tasks: list[dict]) -> None: trigger_tasks = sum(1 for task in tasks if _is_trigger_task(task)) pattern_tasks = sum(1 for task in tasks if _is_pattern_task(task)) + behavioral_tasks = sum(1 for task in tasks if _is_behavioral_task(task)) + + # behavioral tasks are a subset of trigger tasks (same base fields), so subtract them + # to avoid double-counting when checking for pure trigger-rate sets + pure_trigger_tasks = trigger_tasks - behavioral_tasks + + if (pure_trigger_tasks or behavioral_tasks) and pattern_tasks: + raise ValueError( + "Task file mixes trigger-rate/behavioral and pattern benchmark formats. Use one format per run." + ) - if trigger_tasks and pattern_tasks: - raise ValueError("Task file mixes trigger-rate and pattern benchmark formats. Use one format per run.") + if behavioral_tasks and pure_trigger_tasks: + raise ValueError("Task file mixes trigger-rate and behavioral eval modes. Use one eval_mode per run.") + + if behavioral_tasks == len(tasks): + return if trigger_tasks == len(tasks): return @@ -689,6 +707,95 @@ def _run_trigger_rate( Path(task_file).unlink(missing_ok=True) +# --------------------------------------------------------------------------- +# Behavioral evaluator (runs claude -p and checks for artifact creation) +# --------------------------------------------------------------------------- + + +def _run_behavioral_eval( + target_path: Path, + description: str, + tasks: list[dict], + timeout: int = 120, + verbose: bool = False, +) -> list[dict]: + """Run behavioral assessment by invoking claude -p and checking artifact output. + + Each task must have 'query', 'should_trigger', 'artifact_glob', and optionally + 'query_prefix' fields. Tasks are run sequentially since each claude -p invocation + is resource-intensive. 
+ + Returns a list of per-task result dicts with keys: + triggered, should_trigger, pass, new_artifacts + """ + project_root = Path.cwd() + for parent in [project_root, *project_root.parents]: + if (parent / ".claude").is_dir(): + project_root = parent + break + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + results = [] + for task in tasks: + query: str = task["query"] + should_trigger: bool = task["should_trigger"] + artifact_glob: str = task.get("artifact_glob", "adr/*.md") + query_prefix: str = task.get("query_prefix", "/do ") + + full_query = f"{query_prefix}{query}" + + # Snapshot existing artifacts before the run + before: set[str] = set(glob.glob(str(project_root / artifact_glob))) + + triggered = False + new_artifacts: list[str] = [] + + if verbose: + print(f"[behavioral] Running: claude -p {full_query!r}", file=sys.stderr) + + try: + result = subprocess.run( + ["claude", "-p", full_query], + capture_output=True, + text=True, + cwd=str(project_root), + env=env, + timeout=timeout, + ) + if verbose and result.returncode != 0: + print( + f"[behavioral] claude exited {result.returncode}: {result.stderr[:300]}", + file=sys.stderr, + ) + + # Check for new files matching the artifact glob + after: set[str] = set(glob.glob(str(project_root / artifact_glob))) + new_artifacts = sorted(after - before) + triggered = len(new_artifacts) > 0 + + if verbose and new_artifacts: + print(f"[behavioral] New artifacts: {new_artifacts}", file=sys.stderr) + + except subprocess.TimeoutExpired: + if verbose: + print(f"[behavioral] Timed out after {timeout}s for query: {full_query!r}", file=sys.stderr) + triggered = False + + passed = triggered == should_trigger + results.append( + { + "query": query, + "triggered": triggered, + "should_trigger": should_trigger, + "pass": passed, + "new_artifacts": new_artifacts, + } + ) + + return results + + # --------------------------------------------------------------------------- # Evaluation bridge # 
--------------------------------------------------------------------------- @@ -758,7 +865,8 @@ def assess_target( return scores # Detect assessment mode from task format - is_trigger = all(_is_trigger_task(task) for task in tasks) + is_behavioral = all(_is_behavioral_task(task) for task in tasks) + is_trigger = not is_behavioral and all(_is_trigger_task(task) for task in tasks) if is_trigger: results = _run_trigger_rate(target_path, description, tasks, verbose=verbose) @@ -787,9 +895,36 @@ def assess_target( ) return scores + if is_behavioral: + behavioral_results = _run_behavioral_eval(target_path, description, tasks, verbose=verbose) + total = len(behavioral_results) + passed = sum(1 for r in behavioral_results if r.get("pass", False)) + if total == 0: + return scores + + accuracy = passed / total + scores["correctness"] = round(accuracy * 10, 2) + scores["error_handling"] = round(accuracy * 8, 2) + scores["language_idioms"] = round(accuracy * 7, 2) + scores["testing"] = round(accuracy * 8, 2) + scores["efficiency"] = round(min(1.0, accuracy + 0.1) * 6, 2) + scores["tests_pass"] = passed == total + + for r in behavioral_results: + artifact_summary = ", ".join(r.get("new_artifacts", [])) or "none" + scores["task_results"].append( + { + "name": r.get("query", "unnamed")[:40], + "passed": r.get("pass", False), + "score": 1.0 if r.get("pass", False) else 0.0, + "details": f"triggered={r.get('triggered')}, artifacts={artifact_summary}", + } + ) + return scores + # Benchmark behavioral assessment — not yet implemented. - # Use trigger-rate format (tasks with 'query' + 'should_trigger') - # as the recommended starting point per ADR-131 research findings. + # Use trigger-rate tasks ('query' + 'should_trigger') or behavioral tasks + # ('query' + 'should_trigger' + 'eval_mode: behavioral') per ADR-132. raise NotImplementedError( "Pattern benchmark tasks are not yet implemented. " "Use trigger-rate tasks with 'query' and 'should_trigger' fields. 
" diff --git a/skills/do/SKILL.md b/skills/do/SKILL.md index 4f68b1a..c65f6ca 100644 --- a/skills/do/SKILL.md +++ b/skills/do/SKILL.md @@ -71,6 +71,8 @@ Read and follow the repository CLAUDE.md before making any routing decision, bec **Gate**: Complexity classified. Display routing banner (ALL classifications). If not Trivial, proceed to Phase 2. If Trivial, handle directly after showing banner. + + --- ### Phase 2: ROUTE @@ -265,3 +267,5 @@ Solution: Stop execution. Create `task_plan.md`. Resume routing after plan is in - `agents/INDEX.json`: Agent triggers and metadata - `skills/INDEX.json`: Skill triggers, force-route flags, pairs_with - `pipelines/INDEX.json`: Pipeline phases, triggers, composition chains + + From 0434c2b2741cf2f858ec1280e58783c9b2472dbf Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 10:10:51 -0700 Subject: [PATCH 08/12] feat(adr-133): strengthen Phase 1 creation detection in /do SKILL.md Add explicit Creation Request Detection block to Phase 1 CLASSIFY, immediately before the Gate line. The block scans for creation verbs, domain object targets, and implicit creation patterns, then flags the request as [CREATION REQUEST DETECTED] so Phase 4 Step 0 is acknowledged before routing decisions consume model attention. This is ADR-133 Prong 2, Option A. Moving detection to Phase 1 addresses the root cause: the creation protocol was buried in Phase 4 where it competed with agent dispatch instructions and was frequently skipped. --- skills/do/SKILL.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/skills/do/SKILL.md b/skills/do/SKILL.md index c65f6ca..2a1d0aa 100644 --- a/skills/do/SKILL.md +++ b/skills/do/SKILL.md @@ -69,6 +69,22 @@ Read and follow the repository CLAUDE.md before making any routing decision, bec **Optional: Force Direct** — OFF by default. When explicitly enabled, overrides routing for trivial operations. Only applies when the user explicitly requests it. 
+**Creation Request Detection** (MANDATORY scan before Gate): + +Scan the request for creation signals before completing Phase 1: +- Explicit creation verbs: "create", "scaffold", "build", "add new", "new [component]", "implement new" +- Domain object targets: agent, skill, pipeline, hook, feature, plugin, workflow, voice profile +- Implicit creation: "I need a [component]", "we need a [component]", "build me a [component]" + +If ANY creation signal is found AND complexity is Simple+: +1. Output `[CREATION REQUEST DETECTED]` in your Phase 1 response +2. Set an internal reminder: **Phase 4 Step 0 is MANDATORY** — write ADR before dispatching any agent +3. Do NOT proceed to Phase 2 without explicitly acknowledging this flag + +This early detection exists because Phase 4 Step 0 is the most frequently skipped step in /do. Moving detection to Phase 1 ensures the creation protocol fires before routing decisions consume attention. + +**Not a creation request**: debugging, reviewing, fixing, refactoring, explaining, running, checking, auditing existing components. When ambiguous, check whether the output would be a NEW file that doesn't yet exist. + **Gate**: Complexity classified. Display routing banner (ALL classifications). If not Trivial, proceed to Phase 2. If Trivial, handle directly after showing banner. From c25f6a76ccbbd19857d62c64dd67d4487b9967bc Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 10:17:55 -0700 Subject: [PATCH 09/12] feat(adr-133): add creation-protocol-enforcer PreToolUse hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Soft-warns when an Agent dispatch appears to be for a creation task but no recent .adr-session.json is present (stale = >900s or missing). Exit 0 only — never blocks. Prong 2 / Option B of ADR-133. 
--- hooks/creation-protocol-enforcer.py | 145 ++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 hooks/creation-protocol-enforcer.py diff --git a/hooks/creation-protocol-enforcer.py b/hooks/creation-protocol-enforcer.py new file mode 100644 index 0000000..72a2ccf --- /dev/null +++ b/hooks/creation-protocol-enforcer.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# hook-version: 1.0.0 +""" +PreToolUse:Agent Hook: Creation Protocol Enforcer + +Soft-warns when an Agent dispatch appears to be for a creation request +but no ADR has been written yet this session (i.e. .adr-session.json +does not exist or was last modified more than 900 seconds ago). + +This is a SOFT WARN — exit 0 only (never blocks). + +Detection logic: +- Tool is Agent +- tool_input["prompt"] contains creation keywords +- .adr-session.json in project root either does not exist or is stale (>900s) + +Allow-through conditions: +- Tool is not Agent +- No creation keywords found in prompt +- .adr-session.json exists and was modified within the last 900 seconds +- ADR_PROTOCOL_BYPASS=1 env var +""" + +import json +import os +import sys +import time +import traceback +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent / "lib")) +from stdin_timeout import read_stdin + +_BYPASS_ENV = "ADR_PROTOCOL_BYPASS" +_ADR_SESSION_FILE = ".adr-session.json" +_STALENESS_THRESHOLD_SECONDS = 900 + +_CREATION_KEYWORDS = [ + "create", + "scaffold", + "build a new", + "new agent", + "new skill", + "new pipeline", + "new hook", + "implement new", +] + +_WARNING_LINES = [ + "[creation-protocol-enforcer] Creation request detected but no recent ADR session found.", + "/do Phase 4 Step 0 requires: (1) Write ADR at adr/{name}.md, (2) Register via adr-query.py register, THEN dispatch agent.", + "If ADR was already written, set ADR_PROTOCOL_BYPASS=1 to suppress this warning.", +] + + +def _has_creation_keywords(prompt: str) -> bool: + """Return True if the prompt contains any creation 
keyword (case-insensitive).""" + lower = prompt.lower() + return any(kw in lower for kw in _CREATION_KEYWORDS) + + +def _adr_session_is_recent(base_dir: Path) -> bool: + """Return True if .adr-session.json exists and was modified within the threshold.""" + adr_session_path = base_dir / _ADR_SESSION_FILE + if not adr_session_path.exists(): + return False + try: + mtime = os.path.getmtime(adr_session_path) + age = time.time() - mtime + return age <= _STALENESS_THRESHOLD_SECONDS + except OSError: + return False + + +def main() -> None: + """Run the creation protocol enforcement check.""" + debug = os.environ.get("CLAUDE_HOOKS_DEBUG") + + raw = read_stdin(timeout=2) + try: + event = json.loads(raw) + except (json.JSONDecodeError, ValueError): + sys.exit(0) + + # Filter: only act on Agent tool dispatches. + tool_name = event.get("tool_name", "") + if tool_name != "Agent": + sys.exit(0) + + # Bypass env var. + if os.environ.get(_BYPASS_ENV) == "1": + if debug: + print( + f"[creation-protocol-enforcer] Bypassed via {_BYPASS_ENV}=1", + file=sys.stderr, + ) + sys.exit(0) + + tool_input = event.get("tool_input", {}) + prompt = tool_input.get("prompt", "") + if not prompt: + sys.exit(0) + + # Check for creation keywords. + if not _has_creation_keywords(prompt): + if debug: + print( + "[creation-protocol-enforcer] No creation keywords found — allowing through", + file=sys.stderr, + ) + sys.exit(0) + + # Resolve project root. + cwd_str = event.get("cwd") or os.environ.get("CLAUDE_PROJECT_DIR", ".") + base_dir = Path(cwd_str).resolve() + + # Check whether a recent ADR session exists. + if _adr_session_is_recent(base_dir): + if debug: + print( + "[creation-protocol-enforcer] Recent .adr-session.json found — allowing through", + file=sys.stderr, + ) + sys.exit(0) + + # No recent ADR session — emit soft warning to stdout (context injection). 
+ print("\n".join(_WARNING_LINES)) + sys.exit(0) + + +if __name__ == "__main__": + try: + main() + except SystemExit: + raise + except Exception as e: + if os.environ.get("CLAUDE_HOOKS_DEBUG"): + traceback.print_exc(file=sys.stderr) + else: + print( + f"[creation-protocol-enforcer] Error: {type(e).__name__}: {e}", + file=sys.stderr, + ) + # Fail open — never exit non-zero on unexpected errors. + sys.exit(0) From 1d13702c23938653a5490e3866bee142b4169515 Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 10:56:12 -0700 Subject: [PATCH 10/12] fix(index): register kotlin, php, and swift agent entries in INDEX.json Three agents (kotlin-general-engineer, php-general-engineer, swift-general-engineer) existed on disk but were missing from agents/INDEX.json, making them invisible to the routing system. Added all three entries with triggers, pairs_with, complexity, and category sourced directly from each agent's frontmatter. Also fixes the pre-existing golang-general-engineer-compact ordering bug as a side effect of re-sorting the index alphabetically. 
--- agents/INDEX.json | 135 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 115 insertions(+), 20 deletions(-) diff --git a/agents/INDEX.json b/agents/INDEX.json index 06e5212..19d8fb8 100644 --- a/agents/INDEX.json +++ b/agents/INDEX.json @@ -115,23 +115,6 @@ "complexity": "Medium", "category": "meta" }, - "golang-general-engineer-compact": { - "file": "golang-general-engineer-compact.md", - "short_description": "Use this agent for focused Go development with tight context budgets", - "triggers": [ - "go", - "golang", - "tight context", - "compact", - "focused go" - ], - "pairs_with": [ - "go-pr-quality-gate", - "go-testing" - ], - "complexity": "Medium-Complex", - "category": "language" - }, "golang-general-engineer": { "file": "golang-general-engineer.md", "short_description": "Use this agent when you need expert assistance with Go development, including implementing features,\ndebugging issues, reviewing code quality, optimizing performance, or answering technical questions\nabout Go codebases", @@ -151,6 +134,23 @@ "complexity": "Medium-Complex", "category": "language" }, + "golang-general-engineer-compact": { + "file": "golang-general-engineer-compact.md", + "short_description": "Use this agent for focused Go development with tight context budgets", + "triggers": [ + "go", + "golang", + "tight context", + "compact", + "focused go" + ], + "pairs_with": [ + "go-pr-quality-gate", + "go-testing" + ], + "complexity": "Medium-Complex", + "category": "language" + }, "hook-development-engineer": { "file": "hook-development-engineer.md", "short_description": "Use this agent when developing Python hooks for Claude Code's event-driven system", @@ -171,6 +171,34 @@ "complexity": "Comprehensive", "category": "meta" }, + "kotlin-general-engineer": { + "file": "kotlin-general-engineer.md", + "short_description": "Use this agent when you need expert assistance with Kotlin development, including implementing features, debugging issues, reviewing code quality, 
optimizing coroutine usage, or answering technical questions about Kotlin codebases", + "triggers": [ + "kotlin", + "ktor", + "koin", + "coroutine", + "suspend fun", + "kotlin flow", + "StateFlow", + "kotest", + "mockk", + "gradle-kts", + "detekt", + "ktlint", + "ktfmt", + "android kotlin", + "kotlin-multiplatform" + ], + "pairs_with": [ + "systematic-debugging", + "verification-before-completion", + "systematic-code-review" + ], + "complexity": "Medium-Complex", + "category": "language" + }, "kubernetes-helm-engineer": { "file": "kubernetes-helm-engineer.md", "short_description": "Use this agent for Kubernetes and Helm deployment management, troubleshooting, and cloud-native infrastructure", @@ -354,6 +382,38 @@ "complexity": "Medium-Complex", "category": "development" }, + "php-general-engineer": { + "file": "php-general-engineer.md", + "short_description": "Use this agent when you need expert assistance with PHP development, including implementing features, debugging issues, reviewing code quality, enforcing security posture, or answering technical questions about PHP codebases", + "triggers": [ + "php", + "laravel", + "symfony", + "composer", + "artisan", + "eloquent", + "blade", + "twig", + "phpunit", + "pest", + "psr-12", + "psr standards", + "hybris", + "sapcc", + ".php files", + "doctrine", + "php-cs-fixer", + "phpstan", + "psalm" + ], + "pairs_with": [ + "systematic-debugging", + "verification-before-completion", + "systematic-code-review" + ], + "complexity": "Medium-Complex", + "category": "language" + }, "pipeline-orchestrator-engineer": { "file": "pipeline-orchestrator-engineer.md", "short_description": "Use this agent when building new pipelines that require coordinated creation\nof agents, skills, and hooks", @@ -792,7 +852,7 @@ }, "reviewer-meta-process": { "file": "reviewer-meta-process.md", - "short_description": "Meta-analysis of system design decisions \u2014 examines whether the SYSTEM ITSELF is creating\nproblems", + "short_description": 
"Meta-analysis of system design decisions — examines whether the SYSTEM ITSELF is creating\nproblems", "triggers": [ "meta-process review", "system design review", @@ -907,7 +967,7 @@ "hot paths", "N+1 queries", "allocations", - "O(n\u00b2)", + "O(n²)", "caching", "slow code", "performance optimization" @@ -1083,6 +1143,41 @@ "complexity": "Medium", "category": "language" }, + "swift-general-engineer": { + "file": "swift-general-engineer.md", + "short_description": "Use this agent when you need expert assistance with Swift development, including implementing features for iOS, macOS, watchOS, tvOS, visionOS, or server-side Swift, debugging issues, reviewing code quality, or answering technical questions about Swift codebases", + "triggers": [ + "swift", + "ios", + "macos", + "xcode", + "swiftui", + "uikit", + "appkit", + "watchos", + "tvos", + "visionos", + "vapor", + "spm", + "swift-package-manager", + "swiftlint", + "swiftformat", + "xctest", + "swift-testing", + "swift actor", + "swift sendable", + "swift-combine", + "swiftdata", + "coredata" + ], + "pairs_with": [ + "systematic-debugging", + "verification-before-completion", + "systematic-code-review" + ], + "complexity": "Medium-Complex", + "category": "language" + }, "system-upgrade-engineer": { "file": "system-upgrade-engineer.md", "short_description": "Use this agent for systematic upgrades to the agent/skill/hook ecosystem when\nClaude Code ships updates, user goals change, or retro learnings accumulate", @@ -1236,4 +1331,4 @@ "category": "language" } } -} \ No newline at end of file +} From 757063f6d3fd70e16a15715e3a63c581dd464b0d Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 10:59:54 -0700 Subject: [PATCH 11/12] fix(behavioral-eval): raise timeout to 240s, check artifacts after TimeoutExpired MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes to _run_behavioral_eval(): 1. 
Default timeout 120s -> 240s: headless /do creation sessions frequently exceed 120s when they dispatch agents that write files, create plans, etc. 2. Check artifact glob after TimeoutExpired: the subprocess may have written artifacts before the timeout fired. The old code set triggered=False on any timeout, causing false FAIL for tasks that completed their artifact writes but ran over time. E2E baseline results (6-task subset, 240s timeout): - Creation recall: 1/3 (33%) — implicit-create-rails passed (ADR-135 created) - Non-creation precision: 3/3 (100%) - build-agent-rust: genuine compliance gap (completed, no ADR) --- skills/agent-comparison/scripts/optimize_loop.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index 7f4605c..d31cb70 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -716,7 +716,7 @@ def _run_behavioral_eval( target_path: Path, description: str, tasks: list[dict], - timeout: int = 120, + timeout: int = 240, verbose: bool = False, ) -> list[dict]: """Run behavioral assessment by invoking claude -p and checking artifact output. 
@@ -780,7 +780,12 @@ def _run_behavioral_eval( except subprocess.TimeoutExpired: if verbose: print(f"[behavioral] Timed out after {timeout}s for query: {full_query!r}", file=sys.stderr) - triggered = False + # Still check artifacts — the process may have written them before timing out + after_timeout: set[str] = set(glob.glob(str(project_root / artifact_glob))) + new_artifacts = sorted(after_timeout - before) + triggered = len(new_artifacts) > 0 + if verbose and triggered: + print(f"[behavioral] Artifacts found despite timeout: {new_artifacts}", file=sys.stderr) passed = triggered == should_trigger results.append( From 06a866499ee395698fd59010226f6ce478469c78 Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 11:08:38 -0700 Subject: [PATCH 12/12] fix(review-round-1): address 4 findings from PR review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. behavioral eval: always print claude exit code (not only in verbose mode) — silent failures would produce phantom 50% accuracy, corrupting optimization 2. behavioral eval: clean up created artifacts between tasks to prevent stale before-snapshots in multi-round optimization runs 3. creation-protocol-enforcer: expand keyword set to match SKILL.md vocabulary — 'build a', 'add new', 'new feature', 'i need a/an', 'we need a/an' previously covered <50% of the benchmark creation queries 4. 
SKILL.md Phase 1: move [CREATION REQUEST DETECTED] output to the Gate condition so LLM cannot proceed to Phase 2 without acknowledging the flag --- hooks/creation-protocol-enforcer.py | 10 ++++++++++ skills/agent-comparison/scripts/optimize_loop.py | 9 ++++++++- skills/do/SKILL.md | 9 ++++----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/hooks/creation-protocol-enforcer.py b/hooks/creation-protocol-enforcer.py index 72a2ccf..df53bd8 100644 --- a/hooks/creation-protocol-enforcer.py +++ b/hooks/creation-protocol-enforcer.py @@ -39,11 +39,21 @@ "create", "scaffold", "build a new", + "build a ", + "add a new", + "add new", "new agent", "new skill", "new pipeline", "new hook", + "new feature", + "new workflow", + "new plugin", "implement new", + "i need a ", + "i need an ", + "we need a ", + "we need an ", ] _WARNING_LINES = [ diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index d31cb70..dd17781 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -763,7 +763,7 @@ def _run_behavioral_eval( env=env, timeout=timeout, ) - if verbose and result.returncode != 0: + if result.returncode != 0: print( f"[behavioral] claude exited {result.returncode}: {result.stderr[:300]}", file=sys.stderr, @@ -787,6 +787,13 @@ def _run_behavioral_eval( if verbose and triggered: print(f"[behavioral] Artifacts found despite timeout: {new_artifacts}", file=sys.stderr) + # Clean up artifacts so they don't pollute the before-snapshot of the next task + for artifact_path in new_artifacts: + try: + Path(artifact_path).unlink(missing_ok=True) + except OSError: + pass + passed = triggered == should_trigger results.append( { diff --git a/skills/do/SKILL.md b/skills/do/SKILL.md index 2a1d0aa..978e670 100644 --- a/skills/do/SKILL.md +++ b/skills/do/SKILL.md @@ -77,15 +77,14 @@ Scan the request for creation signals before completing Phase 1: - Implicit 
creation: "I need a [component]", "we need a [component]", "build me a [component]" If ANY creation signal is found AND complexity is Simple+: -1. Output `[CREATION REQUEST DETECTED]` in your Phase 1 response -2. Set an internal reminder: **Phase 4 Step 0 is MANDATORY** — write ADR before dispatching any agent -3. Do NOT proceed to Phase 2 without explicitly acknowledging this flag +1. Set an internal flag: `is_creation = true` +2. **Phase 4 Step 0 is MANDATORY** — write ADR before dispatching any agent -This early detection exists because Phase 4 Step 0 is the most frequently skipped step in /do. Moving detection to Phase 1 ensures the creation protocol fires before routing decisions consume attention. +This early detection exists because Phase 4 Step 0 is the most frequently skipped step in /do. Moving detection to Phase 1 ensures the creation protocol fires before routing decisions consume attention. The Gate below enforces acknowledgment before Phase 2. **Not a creation request**: debugging, reviewing, fixing, refactoring, explaining, running, checking, auditing existing components. When ambiguous, check whether the output would be a NEW file that doesn't yet exist. -**Gate**: Complexity classified. Display routing banner (ALL classifications). If not Trivial, proceed to Phase 2. If Trivial, handle directly after showing banner. +**Gate**: Complexity classified. If a creation signal was detected, output `[CREATION REQUEST DETECTED]` before displaying the routing banner. Display routing banner (ALL classifications). If not Trivial, proceed to Phase 2. If Trivial, handle directly after showing banner.