diff --git a/agents/INDEX.json b/agents/INDEX.json index 06e5212..19d8fb8 100644 --- a/agents/INDEX.json +++ b/agents/INDEX.json @@ -115,23 +115,6 @@ "complexity": "Medium", "category": "meta" }, - "golang-general-engineer-compact": { - "file": "golang-general-engineer-compact.md", - "short_description": "Use this agent for focused Go development with tight context budgets", - "triggers": [ - "go", - "golang", - "tight context", - "compact", - "focused go" - ], - "pairs_with": [ - "go-pr-quality-gate", - "go-testing" - ], - "complexity": "Medium-Complex", - "category": "language" - }, "golang-general-engineer": { "file": "golang-general-engineer.md", "short_description": "Use this agent when you need expert assistance with Go development, including implementing features,\ndebugging issues, reviewing code quality, optimizing performance, or answering technical questions\nabout Go codebases", @@ -151,6 +134,23 @@ "complexity": "Medium-Complex", "category": "language" }, + "golang-general-engineer-compact": { + "file": "golang-general-engineer-compact.md", + "short_description": "Use this agent for focused Go development with tight context budgets", + "triggers": [ + "go", + "golang", + "tight context", + "compact", + "focused go" + ], + "pairs_with": [ + "go-pr-quality-gate", + "go-testing" + ], + "complexity": "Medium-Complex", + "category": "language" + }, "hook-development-engineer": { "file": "hook-development-engineer.md", "short_description": "Use this agent when developing Python hooks for Claude Code's event-driven system", @@ -171,6 +171,34 @@ "complexity": "Comprehensive", "category": "meta" }, + "kotlin-general-engineer": { + "file": "kotlin-general-engineer.md", + "short_description": "Use this agent when you need expert assistance with Kotlin development, including implementing features, debugging issues, reviewing code quality, optimizing coroutine usage, or answering technical questions about Kotlin codebases", + "triggers": [ + "kotlin", + "ktor", + "koin", + "coroutine", + "suspend fun", + "kotlin flow", + "StateFlow", + "kotest", + "mockk", + "gradle-kts", + "detekt", + "ktlint", + "ktfmt", + "android kotlin", + "kotlin-multiplatform" + ], + "pairs_with": [ + "systematic-debugging", + "verification-before-completion", + "systematic-code-review" + ], + "complexity": "Medium-Complex", + "category": "language" + }, "kubernetes-helm-engineer": { "file": "kubernetes-helm-engineer.md", "short_description": "Use this agent for Kubernetes and Helm deployment management, troubleshooting, and cloud-native infrastructure", @@ -354,6 +382,38 @@ "complexity": "Medium-Complex", "category": "development" }, + "php-general-engineer": { + "file": "php-general-engineer.md", + "short_description": "Use this agent when you need expert assistance with PHP development, including implementing features, debugging issues, reviewing code quality, enforcing security posture, or answering technical questions about PHP codebases", + "triggers": [ + "php", + "laravel", + "symfony", + "composer", + "artisan", + "eloquent", + "blade", + "twig", + "phpunit", + "pest", + "psr-12", + "psr standards", + "hybris", + "sapcc", + ".php files", + "doctrine", + "php-cs-fixer", + "phpstan", + "psalm" + ], + "pairs_with": [ + "systematic-debugging", + "verification-before-completion", + "systematic-code-review" + ], + "complexity": "Medium-Complex", + "category": "language" + }, "pipeline-orchestrator-engineer": { "file": "pipeline-orchestrator-engineer.md", "short_description": "Use this agent when building new pipelines that require coordinated creation\nof agents, skills, and hooks", @@ -792,7 +852,7 @@ }, "reviewer-meta-process": { "file": "reviewer-meta-process.md", - "short_description": "Meta-analysis of system design decisions \u2014 examines whether the SYSTEM ITSELF is creating\nproblems", + "short_description": "Meta-analysis of system design decisions — examines whether the SYSTEM ITSELF is creating\nproblems", "triggers": [ "meta-process review", "system design review", @@ -907,7 +967,7 @@ "hot paths", "N+1 queries", "allocations", - "O(n\u00b2)", + "O(n²)", "caching", "slow code", "performance optimization" @@ -1083,6 +1143,41 @@ "complexity": "Medium", "category": "language" }, + "swift-general-engineer": { + "file": "swift-general-engineer.md", + "short_description": "Use this agent when you need expert assistance with Swift development, including implementing features for iOS, macOS, watchOS, tvOS, visionOS, or server-side Swift, debugging issues, reviewing code quality, or answering technical questions about Swift codebases", + "triggers": [ + "swift", + "ios", + "macos", + "xcode", + "swiftui", + "uikit", + "appkit", + "watchos", + "tvos", + "visionos", + "vapor", + "spm", + "swift-package-manager", + "swiftlint", + "swiftformat", + "xctest", + "swift-testing", + "swift actor", + "swift sendable", + "swift-combine", + "swiftdata", + "coredata" + ], + "pairs_with": [ + "systematic-debugging", + "verification-before-completion", + "systematic-code-review" + ], + "complexity": "Medium-Complex", + "category": "language" + }, "system-upgrade-engineer": { "file": "system-upgrade-engineer.md", "short_description": "Use this agent for systematic upgrades to the agent/skill/hook ecosystem when\nClaude Code ships updates, user goals change, or retro learnings accumulate", @@ -1236,4 +1331,4 @@ "category": "language" } } -} \ No newline at end of file +} diff --git a/hooks/creation-protocol-enforcer.py b/hooks/creation-protocol-enforcer.py new file mode 100644 index 0000000..df53bd8 --- /dev/null +++ b/hooks/creation-protocol-enforcer.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# hook-version: 1.0.0 +""" +PreToolUse:Agent Hook: Creation Protocol Enforcer + +Soft-warns when an Agent dispatch appears to be for a creation request +but no ADR has been written yet this session (i.e. .adr-session.json +does not exist or was last modified more than 900 seconds ago). + +This is a SOFT WARN — exit 0 only (never blocks). + +Detection logic: +- Tool is Agent +- tool_input["prompt"] contains creation keywords +- .adr-session.json in project root either does not exist or is stale (>900s) + +Allow-through conditions: +- Tool is not Agent +- No creation keywords found in prompt +- .adr-session.json exists and was modified within the last 900 seconds +- ADR_PROTOCOL_BYPASS=1 env var +""" + +import json +import os +import sys +import time +import traceback +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent / "lib")) +from stdin_timeout import read_stdin + +_BYPASS_ENV = "ADR_PROTOCOL_BYPASS" +_ADR_SESSION_FILE = ".adr-session.json" +_STALENESS_THRESHOLD_SECONDS = 900 + +_CREATION_KEYWORDS = [ + "create", + "scaffold", + "build a new", + "build a ", + "add a new", + "add new", + "new agent", + "new skill", + "new pipeline", + "new hook", + "new feature", + "new workflow", + "new plugin", + "implement new", + "i need a ", + "i need an ", + "we need a ", + "we need an ", +] + +_WARNING_LINES = [ + "[creation-protocol-enforcer] Creation request detected but no recent ADR session found.", + "/do Phase 4 Step 0 requires: (1) Write ADR at adr/{name}.md, (2) Register via adr-query.py register, THEN dispatch agent.", + "If ADR was already written, set ADR_PROTOCOL_BYPASS=1 to suppress this warning.", +] + + +def _has_creation_keywords(prompt: str) -> bool: + """Return True if the prompt contains any creation keyword (case-insensitive).""" + lower = prompt.lower() + return any(kw in lower for kw in _CREATION_KEYWORDS) + + +def _adr_session_is_recent(base_dir: Path) -> bool: + """Return True if .adr-session.json exists and was modified within the threshold.""" + adr_session_path = base_dir / _ADR_SESSION_FILE + if not adr_session_path.exists(): + return False + try: + mtime = os.path.getmtime(adr_session_path) + age = time.time() - mtime + return age <= _STALENESS_THRESHOLD_SECONDS + except OSError: + return False + + +def main() -> None: + """Run the creation protocol enforcement check.""" + debug = os.environ.get("CLAUDE_HOOKS_DEBUG") + + raw = read_stdin(timeout=2) + try: + event = json.loads(raw) + except (json.JSONDecodeError, ValueError): + sys.exit(0) + + # Filter: only act on Agent tool dispatches. + tool_name = event.get("tool_name", "") + if tool_name != "Agent": + sys.exit(0) + + # Bypass env var. + if os.environ.get(_BYPASS_ENV) == "1": + if debug: + print( + f"[creation-protocol-enforcer] Bypassed via {_BYPASS_ENV}=1", + file=sys.stderr, + ) + sys.exit(0) + + tool_input = event.get("tool_input", {}) + prompt = tool_input.get("prompt", "") + if not prompt: + sys.exit(0) + + # Check for creation keywords. + if not _has_creation_keywords(prompt): + if debug: + print( + "[creation-protocol-enforcer] No creation keywords found — allowing through", + file=sys.stderr, + ) + sys.exit(0) + + # Resolve project root. + cwd_str = event.get("cwd") or os.environ.get("CLAUDE_PROJECT_DIR", ".") + base_dir = Path(cwd_str).resolve() + + # Check whether a recent ADR session exists. + if _adr_session_is_recent(base_dir): + if debug: + print( + "[creation-protocol-enforcer] Recent .adr-session.json found — allowing through", + file=sys.stderr, + ) + sys.exit(0) + + # No recent ADR session — emit soft warning to stdout (context injection). + print("\n".join(_WARNING_LINES)) + sys.exit(0) + + +if __name__ == "__main__": + try: + main() + except SystemExit: + raise + except Exception as e: + if os.environ.get("CLAUDE_HOOKS_DEBUG"): + traceback.print_exc(file=sys.stderr) + else: + print( + f"[creation-protocol-enforcer] Error: {type(e).__name__}: {e}", + file=sys.stderr, + ) + # Fail open — never exit non-zero on unexpected errors. + sys.exit(0) diff --git a/scripts/rules-distill.py b/scripts/rules-distill.py index 77d57f1..6a19172 100644 --- a/scripts/rules-distill.py +++ b/scripts/rules-distill.py @@ -24,7 +24,9 @@ import argparse import json +import os import re +import subprocess import sys from datetime import datetime, timezone from pathlib import Path @@ -307,18 +309,56 @@ def filter_layer4_not_covered( # --------------------------------------------------------------------------- -def _llm_extract_principles(skill_content: str, skill_name: str) -> list[dict] | None: - """Try to extract principles via Anthropic SDK. +def _run_claude_code(prompt: str, model: str | None = None) -> tuple[str, str]: + """Run Claude Code and return (assistant_text, raw_result_text). - Returns list of dicts with "principle" key, or None if unavailable. + Soft-fail contract: returns ('', '') on any failure (non-zero exit, invalid + JSON, timeout). Callers must treat empty strings as a no-op and fall back + to keyword-based extraction. """ + cmd = ["claude", "-p", prompt, "--output-format", "json", "--print"] + if model: + cmd.extend(["--model", model]) + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=str(REPO_ROOT), + env=env, + timeout=300, + ) + if result.returncode != 0: + print(f"claude -p failed (exit {result.returncode}): {result.stderr}", file=sys.stderr) + return "", "" + try: - import anthropic # type: ignore[import] - except ImportError: - return None + events = json.loads(result.stdout) + except json.JSONDecodeError: + print(f"claude -p returned invalid JSON: {result.stdout[:200]}", file=sys.stderr) + return "", "" + + assistant_text = "" + raw_result_text = "" + for event in events: + if event.get("type") == "assistant": + message = event.get("message", {}) + for content in message.get("content", []): + if content.get("type") == "text": + assistant_text += content.get("text", "") + elif event.get("type") == "result": + raw_result_text = event.get("result", "") + + return assistant_text or raw_result_text, raw_result_text + +def _llm_extract_principles(skill_content: str, skill_name: str) -> list[dict] | None: + """Try to extract principles via Claude Code. + + Returns list of dicts with "principle" key, or None if unavailable. + """ try: - client = anthropic.Anthropic() prompt = f"""You are analyzing a Claude Code skill file to extract cross-cutting behavioral principles. Skill: {skill_name} @@ -337,12 +377,8 @@ def _llm_extract_principles(skill_content: str, skill_name: str) -> list[dict] | Return [] if no universal principles are found.""" - message = client.messages.create( - model="claude-haiku-4-5", - max_tokens=1024, - messages=[{"role": "user", "content": prompt}], - ) - raw = message.content[0].text.strip() + raw, _ = _run_claude_code(prompt, model="claude-haiku-4-5") + raw = raw.strip() # Parse JSON principles = json.loads(raw) if not isinstance(principles, list): @@ -352,7 +388,8 @@ def _llm_extract_principles(skill_content: str, skill_name: str) -> list[dict] | for p in principles if isinstance(p, str) and len(p) >= 15 ] - except Exception: + except Exception as exc: + print(f"LLM extraction failed: {exc}", file=sys.stderr) return None diff --git a/scripts/skill_eval/__init__.py b/scripts/skill_eval/__init__.py index 34d2ca8..65f3d05 100644 --- a/scripts/skill_eval/__init__.py +++ b/scripts/skill_eval/__init__.py @@ -1,10 +1,9 @@ """Skill evaluation and description optimization toolkit. -Ported from Anthropic's skill-creator (https://github.com/anthropics/skills). -Adapted for the agents repo architecture. +Adapted from the upstream skill-creator workflow for this repo architecture. Usage: python -m scripts.skill_eval.run_eval --eval-set evals.json --skill-path path/to/skill - python -m scripts.skill_eval.run_loop --eval-set evals.json --skill-path path/to/skill --model claude-opus-4-6 + python -m scripts.skill_eval.run_loop --eval-set evals.json --skill-path path/to/skill python -m scripts.skill_eval.quick_validate path/to/skill """ diff --git a/scripts/skill_eval/improve_description.py b/scripts/skill_eval/improve_description.py index 1deb2b8..9b44ae2 100644 --- a/scripts/skill_eval/improve_description.py +++ b/scripts/skill_eval/improve_description.py @@ -2,33 +2,82 @@ """Improve a skill description based on eval results. Takes eval results (from run_eval.py) and generates an improved description -using Claude with extended thinking. +through `claude -p`. """ import argparse import json +import os import re +import subprocess import sys from pathlib import Path -import anthropic - from scripts.skill_eval.utils import parse_skill_md +def _find_project_root() -> Path: + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + print("Warning: .claude/ directory not found, using cwd as project root", file=sys.stderr) + return current + + +def _run_claude_code(prompt: str, model: str | None) -> tuple[str, str]: + """Run Claude Code and return (assistant_text, raw_result_text).""" + cmd = ["claude", "-p", prompt, "--output-format", "json", "--print"] + if model: + cmd.extend(["--model", model]) + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=str(_find_project_root()), + env=env, + timeout=300, + ) + if result.returncode != 0: + print(f"Error: claude -p failed with code {result.returncode}", file=sys.stderr) + if result.stderr: + print(result.stderr.strip(), file=sys.stderr) + sys.exit(1) + + try: + events = json.loads(result.stdout) + except json.JSONDecodeError as exc: + print(f"Error: could not parse claude -p JSON output: {exc}", file=sys.stderr) + sys.exit(1) + + assistant_text = "" + raw_result_text = "" + for event in events: + if event.get("type") == "assistant": + message = event.get("message", {}) + for content in message.get("content", []): + if content.get("type") == "text": + assistant_text += content.get("text", "") + elif event.get("type") == "result": + raw_result_text = event.get("result", "") + + return assistant_text or raw_result_text, raw_result_text + + def improve_description( - client: anthropic.Anthropic, skill_name: str, skill_content: str, current_description: str, eval_results: dict, history: list[dict], - model: str, + model: str | None, test_results: dict | None = None, log_dir: Path | None = None, iteration: int | None = None, ) -> str: - """Call Claude to improve the description based on eval results.""" + """Call Claude Code to improve the description based on eval results.""" failed_triggers = [r for r in eval_results["results"] if r["should_trigger"] and not r["pass"]] false_triggers = [r for r in eval_results["results"] if not r["should_trigger"] and not r["pass"]] @@ -107,35 +156,20 @@ def improve_description( Please respond with only the new description text in tags, nothing else.""" - response = client.messages.create( - model=model, - max_tokens=16000, - thinking={ - "type": "enabled", - "budget_tokens": 10000, - }, - messages=[{"role": "user", "content": prompt}], - ) - - # Extract thinking and text from response - thinking_text = "" - text = "" - for block in response.content: - if block.type == "thinking": - thinking_text = block.thinking - elif block.type == "text": - text = block.text + text, raw_result_text = _run_claude_code(prompt, model) # Parse out the tags match = re.search(r"(.*?)", text, re.DOTALL) + if not match: + print("Warning: tags not found in response, using raw output", file=sys.stderr) description = match.group(1).strip().strip('"') if match else text.strip().strip('"') # Log the transcript transcript: dict = { "iteration": iteration, "prompt": prompt, - "thinking": thinking_text, "response": text, + "raw_result_text": raw_result_text, "parsed_description": description, "char_count": len(description), "over_limit": len(description) > 1024, @@ -144,34 +178,18 @@ def improve_description( # If over 1024 chars, ask the model to shorten it if len(description) > 1024: shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in tags." - shorten_response = client.messages.create( - model=model, - max_tokens=16000, - thinking={ - "type": "enabled", - "budget_tokens": 10000, - }, - messages=[ - {"role": "user", "content": prompt}, - {"role": "assistant", "content": text}, - {"role": "user", "content": shorten_prompt}, - ], + rewrite_request = ( + f"{prompt}\n\nPrevious assistant response:\n\n{text}\n\n\n" + f"{shorten_prompt}" ) - - shorten_thinking = "" - shorten_text = "" - for block in shorten_response.content: - if block.type == "thinking": - shorten_thinking = block.thinking - elif block.type == "text": - shorten_text = block.text + shorten_text, shorten_raw_result_text = _run_claude_code(rewrite_request, model) match = re.search(r"(.*?)", shorten_text, re.DOTALL) shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') transcript["rewrite_prompt"] = shorten_prompt - transcript["rewrite_thinking"] = shorten_thinking transcript["rewrite_response"] = shorten_text + transcript["rewrite_raw_result_text"] = shorten_raw_result_text transcript["rewrite_description"] = shortened transcript["rewrite_char_count"] = len(shortened) description = shortened @@ -191,7 +209,7 @@ def main(): parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") parser.add_argument("--skill-path", required=True, help="Path to skill directory") parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") - parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--model", default=None, help="Optional Claude Code model override") parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") args = parser.parse_args() @@ -212,9 +230,7 @@ def main(): print(f"Current: {current_description}", file=sys.stderr) print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr) - client = anthropic.Anthropic() new_description = improve_description( - client=client, skill_name=name, skill_content=content, current_description=current_description, diff --git a/scripts/skill_eval/run_loop.py b/scripts/skill_eval/run_loop.py index 6bc0c61..48034e6 100644 --- a/scripts/skill_eval/run_loop.py +++ b/scripts/skill_eval/run_loop.py @@ -15,8 +15,6 @@ import webbrowser from pathlib import Path -import anthropic - from scripts.skill_eval.generate_report import generate_html from scripts.skill_eval.improve_description import improve_description from scripts.skill_eval.run_eval import find_project_root, run_eval @@ -56,7 +54,7 @@ def run_loop( runs_per_query: int, trigger_threshold: float, holdout: float, - model: str, + model: str | None, verbose: bool, live_report_path: Path | None = None, log_dir: Path | None = None, @@ -75,7 +73,6 @@ def run_loop( train_set = eval_set test_set = [] - client = anthropic.Anthropic() history = [] exit_reason = "unknown" @@ -206,7 +203,6 @@ def print_eval_stats(label, results, elapsed): # Strip test scores from history so improvement model can't see them blinded_history = [{k: v for k, v in h.items() if not k.startswith("test_")} for h in history] new_description = improve_description( - client=client, skill_name=name, skill_content=content, current_description=current_description, @@ -264,7 +260,7 @@ def main(): parser.add_argument( "--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)" ) - parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--model", default=None, help="Optional Claude Code model override") parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") parser.add_argument( "--report", diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index a0f2faa..662b63f 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -1,8 +1,8 @@ import importlib.util import json -from pathlib import Path +import subprocess import sys - +from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[2] @@ -39,13 +39,7 @@ def test_check_protected_sections_rejects_missing_blocks(): "agent_comparison_optimize_loop", "skills/agent-comparison/scripts/optimize_loop.py", ) - original = ( - "alpha\n" - "\n" - "keep me\n" - "\n" - "omega\n" - ) + original = "alpha\n\nkeep me\n\nomega\n" relocated = "alpha\nomega\n" assert optimize_loop.check_protected_sections(original, relocated) is False @@ -56,13 +50,7 @@ def test_restore_protected_does_not_silently_reinsert_missing_blocks(): "agent_comparison_generate_variant", "skills/agent-comparison/scripts/generate_variant.py", ) - original = ( - "alpha\n" - "\n" - "keep me\n" - "\n" - "omega\n" - ) + original = "alpha\n\nkeep me\n\nomega\n" variant = "alpha\nomega\n" restored = generate_variant.restore_protected(original, variant) @@ -76,39 +64,33 @@ def test_generate_variant_main_reads_current_content_from_file(tmp_path, monkeyp "skills/agent-comparison/scripts/generate_variant.py", ) - class FakeBlock: - def __init__(self, block_type: str, text: str): - self.type = block_type - if block_type == "thinking": - self.thinking = text - else: - self.text = text - - class FakeResponse: - def __init__(self): - self.content = [ - FakeBlock("thinking", "reasoning"), - FakeBlock( - "text", - "---\ndescription: updated\n---" - "updated", - ), - ] - self.usage = type("Usage", (), {"input_tokens": 1, "output_tokens": 2})() - - class FakeClient: - def __init__(self): - self.messages = type("Messages", (), {"create": lambda self, **kwargs: FakeResponse()})() - - class FakeAnthropicModule: - class Anthropic: - def __new__(cls): - return FakeClient() - content_file = tmp_path / "current.md" content_file.write_text("---\ndescription: current\n---\n") - monkeypatch.setattr(generate_variant, "anthropic", FakeAnthropicModule) + def fake_run(cmd, capture_output, text, cwd, env, timeout): + assert cmd[:2] == ["claude", "-p"] + payload = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "text", + "text": "---\ndescription: updated\n---" + "updated", + } + ] + }, + }, + { + "type": "result", + "result": "raw result", + "usage": {"input_tokens": 1, "output_tokens": 2}, + }, + ] + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + monkeypatch.setattr(generate_variant.subprocess, "run", fake_run) monkeypatch.setattr( sys, "argv", @@ -130,3 +112,204 @@ def __new__(cls): assert output["variant"] == "---\ndescription: updated\n---" assert output["tokens_used"] == 3 + assert output["reasoning"] == "raw result" + + +def test_optimize_loop_omits_model_flag_when_not_provided(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_nomodel", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n") + tasks = [ + {"name": "train-positive", "query": "write go tests", "should_trigger": True, "split": "train"}, + {"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "split": "test"}, + ] + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text(json.dumps({"tasks": tasks})) + + seen_cmds = [] + + def fake_assess_target(*args, **kwargs): + return { + "parses": True, + "correctness": 1.0, + "conciseness": 1.0, + "clarity": 1.0, + "task_results": [{"name": "train-positive", "passed": False}], + } + + def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): + seen_cmds.append(cmd) + payload = { + "variant": target.read_text(), + "summary": "no-op", + "reasoning": "ok", + "tokens_used": 0, + "deletion_justification": "", + } + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + monkeypatch.setattr(optimize_loop.subprocess, "run", fake_run) + + optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.02, + train_split=0.6, + model=None, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + verbose=False, + dry_run=False, + ) + + assert seen_cmds + assert "--model" not in seen_cmds[0] + + +def test_optimize_loop_respects_revert_streak_limit(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_revert_limit", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "write go tests", "should_trigger": True, "split": "train"}, + {"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "split": "test"}, + ] + } + ) + ) + + def fake_assess_target(*args, **kwargs): + return { + "parses": True, + "correctness": 0.0, + "conciseness": 1.0, + "clarity": 1.0, + "task_results": [{"name": "train-positive", "passed": False}], + } + + def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): + payload = { + "variant": target.read_text(), + "summary": "no-op", + "reasoning": "ok", + "tokens_used": 0, + "deletion_justification": "", + } + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + monkeypatch.setattr(optimize_loop.subprocess, "run", fake_run) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=10, + min_gain=0.02, + train_split=0.6, + revert_streak_limit=2, + model=None, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + verbose=False, + dry_run=False, + ) + + assert result["status"] == "CONVERGED" + assert "2 rounds without KEEP" in result["exit_reason"] + + +def test_optimize_loop_beam_search_retains_top_k_candidates(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_beam", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n\n# Skill\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "write go tests", "should_trigger": True, "split": "train"}, + {"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "split": "test"}, + ] + } + ) + ) + + generated = iter(["alpha", "beta"]) + + def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): + label = next(generated) + payload = { + "variant": target.read_text() + f"\n\n", + "summary": f"candidate-{label}", + "reasoning": "ok", + "tokens_used": 10, + "deletion_justification": "", + } + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + def fake_assess_target(path, *args, **kwargs): + content = Path(path).read_text() + score = 0.0 + if "" in content: + score = 1.2 + elif "" in content: + score = 2.4 + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": score, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [], + } + + monkeypatch.setattr(optimize_loop.subprocess, "run", fake_run) + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + train_split=0.6, + beam_width=2, + candidates_per_parent=2, + model=None, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + verbose=False, + dry_run=False, + ) + + assert result["search_strategy"] == "beam" + assert result["beam_width"] == 2 + assert result["candidates_per_parent"] == 2 + assert result["improvements_found"] == 2 + selected = [it for it in result["iterations"] if it.get("selected_for_frontier")] + assert len(selected) == 2 + assert selected[0]["frontier_rank"] == 1 or selected[1]["frontier_rank"] == 1 diff --git a/scripts/tests/test_eval_compare_optimization.py b/scripts/tests/test_eval_compare_optimization.py index f687c7a..011695c 100644 --- a/scripts/tests/test_eval_compare_optimization.py +++ b/scripts/tests/test_eval_compare_optimization.py @@ -2,7 +2,6 @@ import json from pathlib import Path - REPO_ROOT = Path(__file__).resolve().parents[2] diff --git a/scripts/tests/test_rules_distill.py b/scripts/tests/test_rules_distill.py index 5acbc6a..854d963 100644 --- a/scripts/tests/test_rules_distill.py +++ b/scripts/tests/test_rules_distill.py @@ -8,6 +8,7 @@ import importlib import json +import subprocess import sys import tempfile from datetime import datetime, timedelta, timezone @@ -549,6 +550,32 @@ def test_writes_pending_json_in_live_mode(self, tmp_path): assert "skills_scanned" in written assert "candidates" in written + +class TestLlmExtraction: + def test_llm_extract_principles_uses_claude_code(self): + payload = json.dumps( + [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": '["Always verify output before completion"]'}]}, + }, + {"type": "result", "result": "ok"}, + ] + ) + completed = subprocess.CompletedProcess(["claude"], 0, stdout=payload, stderr="") + + with patch.object(rules_distill.subprocess, "run", return_value=completed) as mock_run: + result = rules_distill._llm_extract_principles("content", "skill-a") + + assert result == [ + { + "principle": "Always verify output before completion", + "raw": "Always verify output before completion", + "source": "skill-a", + } + ] + assert mock_run.call_args.args[0][:2] == ["claude", "-p"] + def test_candidates_have_required_fields(self, tmp_path): """All candidates must have id, principle, skills, status, confidence, verdict.""" skills_dir = tmp_path / "skills" diff --git a/scripts/tests/test_skill_eval_claude_code.py b/scripts/tests/test_skill_eval_claude_code.py new file mode 100644 index 0000000..a0c9e05 --- /dev/null +++ b/scripts/tests/test_skill_eval_claude_code.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import json +import subprocess +from pathlib import Path + + +def test_improve_description_uses_claude_code_and_shortens(monkeypatch, tmp_path): + from scripts.skill_eval import improve_description as mod + + calls: list[list[str]] = [] + + def fake_run(cmd, capture_output, text, cwd, env, timeout): + calls.append(cmd) + if len(calls) == 1: + text_out = "" + ("a" * 1030) + "" + else: + text_out = "short and valid" + payload = [ + {"type": "assistant", "message": {"content": [{"type": "text", "text": text_out}]}}, + {"type": "result", "result": "raw result"}, + ] + return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") + + monkeypatch.setattr(mod.subprocess, "run", fake_run) + + description = mod.improve_description( + skill_name="skill-eval", + skill_content="# Skill", + current_description="old", + eval_results={ + "results": [ + {"query": "improve this skill", "should_trigger": True, "pass": False, "triggers": 0, "runs": 1} + ], + "summary": {"passed": 0, "failed": 1, "total": 1}, + }, + history=[], + model=None, + log_dir=tmp_path, + iteration=1, + ) + + assert description == "short and valid" + assert calls + assert calls[0][:2] == ["claude", "-p"] + transcript = json.loads((tmp_path / "improve_iter_1.json").read_text()) + assert transcript["raw_result_text"] == "raw result" + assert transcript["rewrite_raw_result_text"] == "raw result" diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md index 7f7d1eb..21e8c15 100644 --- a/skills/agent-comparison/SKILL.md +++ b/skills/agent-comparison/SKILL.md @@ -262,7 +262,7 @@ Remove temporary benchmark files and debug outputs. Keep only the comparison rep ### Phase 5: OPTIMIZE (optional — invoked explicitly) -**Goal**: Run an automated optimization loop that iteratively improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then keeps only measured improvements. +**Goal**: Run an automated optimization loop that improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then selects the best measured variants through beam search or single-path search. This phase is for routing/trigger optimization, not full code-generation benchmarking. Invoke it when the user says "optimize this skill", "optimize the description", or "run autoresearch". The existing manual A/B comparison (Phases 1-4) remains the path for full agent benchmarking. @@ -288,7 +288,6 @@ python3 skills/agent-comparison/scripts/optimize_loop.py \ --goal "{optimization goal}" \ --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ --train-split 0.6 \ - --model claude-sonnet-4-20250514 \ --verbose ``` @@ -309,12 +308,15 @@ The loop automatically evaluates the unmodified target against the train set bef **Step 4: Enter optimization loop** The `optimize_loop.py` script handles the full loop: -- Calls `generate_variant.py` to propose changes (Claude with extended thinking) +- Calls `generate_variant.py` to propose changes through `claude -p` - Evaluates each variant against train tasks -- Keeps variants that improve score by more than `--min-gain` (default 0.02) +- Runs either: + - single-path hill climbing: `--beam-width 1 --candidates-per-parent 1` + - beam search with top-K retention: keep the best `K` improving candidates each round +- Keeps variants that beat their parent by more than `--min-gain` (default 0.02) - Reverts variants that don't improve, break hard gates, or delete sections without justification -- Checks held-out test set every 5 iterations for Goodhart divergence -- Stops on convergence (5 consecutive reverts), Goodhart alarm, or max iterations +- Checks held-out test set every `--holdout-check-cadence` rounds for Goodhart divergence +- Stops on convergence (`--revert-streak-limit` rounds without any KEEP), Goodhart alarm, or max iterations ```bash python3 skills/agent-comparison/scripts/optimize_loop.py \ @@ -324,13 +326,24 @@ python3 skills/agent-comparison/scripts/optimize_loop.py \ --max-iterations 20 \ --min-gain 0.02 \ --train-split 0.6 \ - --model claude-sonnet-4-20250514 \ + --beam-width 3 \ + --candidates-per-parent 2 \ + --revert-streak-limit 8 \ + --holdout-check-cadence 5 \ --report optimization-report.html \ --output-dir evals/iterations \ --verbose ``` -The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and cherry-pick controls. +Omit `--model` to use Claude Code's configured default model, or pass it explicitly if you need a specific override. + +The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and review/export controls. + +Recommended modes: +- Fast single-path optimization: `--beam-width 1 --candidates-per-parent 1` +- True autoresearch sweep: `--max-iterations 20 --beam-width 3 --candidates-per-parent 2 --revert-streak-limit 20` +- Conservative search with strict keeps: raise `--min-gain` above `0.02` +- Exploratory search that accepts small wins: use `--min-gain 0.0` **Step 5: Present results in UI** @@ -340,19 +353,21 @@ Open the generated `optimization-report.html` in a browser. The report shows: - Iteration table with verdict, composite score, delta, and change summary - Expandable inline diffs per iteration (click any row) -**Step 6: User cherry-picks improvements** +**Step 6: Review kept snapshots** -Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews each kept iteration's diff and selects which to include: -- Check the "Pick" checkbox for desired iterations -- Click "Preview Combined" to see the merged diff -- Click "Export Selected" to download a JSON file with chosen diffs +Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews the kept iterations as candidate snapshots from the original target: +- Inspect each kept iteration's diff in the report +- Use "Preview Selected Snapshot" only as a comparison aid in the UI +- Use "Export Selected" to download a review JSON describing the selected snapshot diff +- In beam mode, review the retained frontier candidates first; they are the strongest candidates from the latest round **Step 7: Apply selected improvements to target file** -Apply the selected improvements to the original target file. +Apply one reviewed improvement to the original target file. - If you want the best single kept variant, use `evals/iterations/best_variant.md`. -- If you exported selected diffs, treat that JSON as review material for a manual follow-up apply step. It is not auto-applied by the current tooling. +- Beam search still writes a single `best_variant.md`: the highest-scoring kept candidate seen anywhere in the run. +- If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple kept diffs into a generated patch. ```bash # Review the best kept variant before applying @@ -370,7 +385,11 @@ After applying improvements, run a final evaluation on ALL tasks (not just train # Re-run optimize_loop.py against the same task file and inspect results.json/report output ``` -Compare final scores to the baseline to confirm net improvement. +Compare final scores to the baseline to confirm net improvement. In beam mode, the final report and `results.json` also include: +- `beam_width` +- `candidates_per_parent` +- `holdout_check_cadence` +- per-iteration frontier metadata (`selected_for_frontier`, `frontier_rank`, `parent_iteration`) **Step 9: Record in learning-db** diff --git a/skills/agent-comparison/references/do-creation-compliance-tasks.json b/skills/agent-comparison/references/do-creation-compliance-tasks.json new file mode 100644 index 0000000..81c2e05 --- /dev/null +++ b/skills/agent-comparison/references/do-creation-compliance-tasks.json @@ -0,0 +1,292 @@ +{ + "tasks": [ + { + "name": "create-agent-prometheus", + "split": "train", + "complexity": "complex", + "query": "create a new Prometheus alerting agent", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "scaffold-skill-migration", + "split": "train", + "complexity": "complex", + "query": "scaffold a new skill for database migration safety", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "build-pipeline-security", + "split": "train", + "complexity": "complex", + "query": "build a pipeline for automated security scanning", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "create-hook-sql", + "split": "train", + "complexity": "simple", + "query": "create a PostToolUse hook that detects SQL injection patterns", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "new-feature-webhooks", + "split": "train", + "complexity": "complex", + "query": "new feature: add webhook support for deployment notifications", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "scaffold-perses-plugin", + "split": "train", + "complexity": "complex", + "query": "scaffold a Perses dashboard plugin with CUE schema and React component", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "implicit-create-rails-agent", + "split": "train", + "complexity": "simple", + "query": "I need an agent for Ruby on Rails development", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "add-linting-pipeline", + "split": "train", + "complexity": "simple", + "query": "add a new linting pipeline to the toolkit", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "create-voice-profile", + "split": "train", + "complexity": "complex", + "query": "create a new voice profile from my blog writing samples", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "build-agent-rust", + "split": "train", + "complexity": "simple", + "query": "build a Rust development agent with cargo integration", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "create-reviewer-agent", + "split": "test", + "complexity": "complex", + "query": "create a new code review agent focused on accessibility compliance", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "scaffold-etl-pipeline", + "split": "test", + "complexity": "complex", + "query": "scaffold an ETL pipeline skill with data validation phases", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "new-session-hook", + "split": "test", + "complexity": "simple", + "query": "new SessionStart hook that loads team configuration from a YAML file", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "create-monitoring-skill", + "split": "test", + "complexity": "simple", + "query": "create a skill for monitoring Kubernetes pod health across namespaces", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "build-terraform-agent", + "split": "test", + "complexity": "complex", + "query": "build a Terraform infrastructure agent with plan-apply-verify phases", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "implicit-create-java-skill", + "split": "test", + "complexity": "simple", + "query": "we need a Java Spring Boot development skill", + "should_trigger": true, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-debug-go-tests", + "split": "train", + "complexity": "complex", + "query": "debug why the Go tests are failing in CI", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-review-pr-security", + "split": "train", + "complexity": "complex", + "query": "review this PR for security issues", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-optimize-db", + "split": "train", + "complexity": "simple", + "query": "optimize the database query performance", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-explain-routing", + "split": "train", + "complexity": "simple", + "query": "explain how the routing system works", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-update-errors", + "split": "train", + "complexity": "simple", + "query": "update the error messages in the auth handler", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-research-rbac", + "split": "train", + "complexity": "complex", + "query": "research best practices for Kubernetes RBAC", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-check-ci", + "split": "train", + "complexity": "simple", + "query": "check the CI status on this branch", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-fix-import", + "split": "train", + "complexity": "simple", + "query": "fix the broken import in agents/INDEX.json", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-refactor-middleware", + "split": "train", + "complexity": "complex", + "query": "refactor the authentication middleware to use context propagation", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-run-tests", + "split": "train", + "complexity": "simple", + "query": "run the Python quality gate on the scripts directory", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-check-coverage", + "split": "test", + "complexity": "simple", + "query": "check test coverage for the voice validation module", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-deploy-staging", + "split": "test", + "complexity": "complex", + "query": "deploy the latest version to the staging environment", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-audit-deps", + "split": "test", + "complexity": "simple", + "query": "audit dependencies for known CVEs in the Python packages", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-compare-agents", + "split": "test", + "complexity": "complex", + "query": "compare the golang-general-engineer and golang-compact agents on error handling tasks", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-investigate-memory-leak", + "split": "test", + "complexity": "complex", + "query": "investigate the memory leak in the webhook processing service", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + }, + { + "name": "neg-merge-pr", + "split": "test", + "complexity": "simple", + "query": "merge PR 205 after CI passes", + "should_trigger": false, + "eval_mode": "behavioral", + "artifact_glob": "adr/*.md" + } + ] +} diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md index 2c32067..3aa0f6a 100644 --- a/skills/agent-comparison/references/optimization-guide.md +++ b/skills/agent-comparison/references/optimization-guide.md @@ -90,17 +90,35 @@ python3 skills/agent-comparison/scripts/optimize_loop.py \ --train-split 0.6 \ --max-iterations 20 \ --min-gain 0.02 \ - --model claude-sonnet-4-20250514 \ + --beam-width 3 \ + --candidates-per-parent 2 \ + --revert-streak-limit 20 \ + --holdout-check-cadence 5 \ --report optimization-report.html \ --output-dir evals/iterations \ --verbose ``` +By default this uses Claude Code's configured model via `claude -p`. Pass `--model` only when you want to override that explicitly. + Useful flags: -- `--dry-run`: exercise the loop mechanics without API calls +- `--dry-run`: exercise the loop mechanics without calling Claude Code - `--report`: write a live HTML report - `--output-dir`: persist iteration snapshots and `results.json` +- `--beam-width`: retain the best K improving candidates per round +- `--candidates-per-parent`: generate multiple sibling variants from each frontier candidate +- `--revert-streak-limit`: stop after N rounds without any KEEP candidates +- `--holdout-check-cadence`: evaluate the global best on held-out tasks every N rounds + +Recommended search presets: + +- Single-path local search: + - `--beam-width 1 --candidates-per-parent 1` +- Balanced beam search: + - `--beam-width 3 --candidates-per-parent 2` +- Aggressive exploration: + - `--beam-width 5 --candidates-per-parent 3 --min-gain 0.0` ## Evaluation Model @@ -109,6 +127,7 @@ The loop follows the ADR-131 structure: 1. Hard gates 2. Weighted composite score 3. Held-out regression checks +4. Frontier retention ### Layer 1: Hard Gates @@ -126,14 +145,27 @@ preserved verbatim. ### Layer 2: Composite Score The loop converts trigger-rate evaluation results into a weighted composite -score using the built-in weights in `optimize_loop.py`. A variant is kept only -if it beats the previous best by more than `--min-gain`. +score using the built-in weights in `optimize_loop.py`. A candidate is kept only +if it beats its parent by more than `--min-gain`. ### Layer 3: Held-Out Regression Check -Every 5 iterations, the current best variant is scored on the held-out test set. -If held-out performance drops below the baseline while train performance has -improved, the loop raises a Goodhart alarm and stops. +Every `--holdout-check-cadence` rounds, the current global best variant is +scored on the held-out test set. If held-out performance drops below the +baseline while train performance has improved, the loop raises a Goodhart +alarm and stops. + +### Layer 4: Frontier Retention + +When beam search is enabled: + +- each frontier candidate generates `--candidates-per-parent` siblings +- every sibling is scored independently +- the top `--beam-width` KEEP candidates become the next frontier +- `best_variant.md` still tracks the single best candidate seen anywhere in the run + +When `--beam-width 1 --candidates-per-parent 1`, the behavior collapses back to +the original single-path optimizer. ## Deletion Safety Rule @@ -156,12 +188,15 @@ When `--output-dir` is set, the loop writes: - `best_variant.md` - `results.json` +`results.json` also records search metadata such as `beam_width`, +`candidates_per_parent`, and per-iteration frontier selection markers. + When `--report` is set, it also writes a live HTML dashboard showing: - status, baseline, best score, kept/reverted counts - convergence chart - iteration table with diffs -- cherry-pick controls for kept iterations +- review/export controls for kept snapshot diffs from the original target ## Choosing Good Eval Tasks diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py index 2378e50..31cb244 100644 --- a/skills/agent-comparison/scripts/generate_variant.py +++ b/skills/agent-comparison/scripts/generate_variant.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -"""Generate a variant of an agent/skill file using Claude with extended thinking. +"""Generate a variant of an agent/skill file using Claude Code. Proposes modifications to improve the target file based on the optimization goal and previous iteration failures. Preserves protected sections marked with DO NOT OPTIMIZE markers. -Pattern: follows improve_description.py's Claude + extended thinking approach. +Pattern: uses `claude -p` so generation runs through Claude Code directly. Usage: python3 skills/agent-comparison/scripts/generate_variant.py \ @@ -13,7 +13,7 @@ --goal "improve error handling instructions" \ --current-content "..." \ --failures '[...]' \ - --model claude-sonnet-4-20250514 + --model claude-opus-4-6 Output (JSON to stdout): { @@ -31,13 +31,11 @@ import argparse import json +import os import re +import subprocess import sys - -try: - import anthropic -except ImportError: # pragma: no cover - exercised in environments without the SDK - anthropic = None +from pathlib import Path # --------------------------------------------------------------------------- # Protected section handling @@ -61,8 +59,7 @@ def restore_protected(original: str, variant: str) -> str: if len(orig_sections) != len(var_sections): print( - "Warning: Protected section count mismatch " - f"(original={len(orig_sections)}, variant={len(var_sections)}).", + f"Warning: Protected section count mismatch (original={len(orig_sections)}, variant={len(var_sections)}).", file=sys.stderr, ) return variant @@ -94,16 +91,69 @@ def detect_deletions(original: str, variant: str) -> list[str]: # --------------------------------------------------------------------------- +def _find_project_root() -> Path: + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + print("Warning: .claude/ directory not found, using cwd as project root", file=sys.stderr) + return current + + +def _run_claude_code(prompt: str, model: str | None) -> tuple[str, str, int]: + """Run Claude Code and return (response_text, raw_result_text, tokens_used).""" + cmd = ["claude", "-p", prompt, "--output-format", "json", "--print"] + if model: + cmd.extend(["--model", model]) + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=str(_find_project_root()), + env=env, + timeout=300, + ) + if result.returncode != 0: + print(f"Error: claude -p failed with code {result.returncode}", file=sys.stderr) + if result.stderr: + print(result.stderr.strip(), file=sys.stderr) + sys.exit(1) + + try: + events = json.loads(result.stdout) + except json.JSONDecodeError as exc: + print(f"Error: could not parse claude -p JSON output: {exc}", file=sys.stderr) + sys.exit(1) + + assistant_text = "" + raw_result_text = "" + tokens_used = 0 + for event in events: + if event.get("type") == "assistant": + message = event.get("message", {}) + for content in message.get("content", []): + if content.get("type") == "text": + assistant_text += content.get("text", "") + elif event.get("type") == "result": + raw_result_text = event.get("result", "") + usage = event.get("usage", {}) + tokens_used = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) + + return assistant_text or raw_result_text, raw_result_text, tokens_used + + def generate_variant( - client: anthropic.Anthropic, target_path: str, goal: str, current_content: str, failures: list[dict], - model: str, + model: str | None, history: list[dict] | None = None, + diversification_note: str | None = None, ) -> dict: - """Call Claude to generate a variant of the target file. + """Call Claude Code to generate a variant of the target file. Returns dict with variant content, summary, reasoning, and token count. """ @@ -118,7 +168,13 @@ def generate_variant( if history: history_section = "\n\nPrevious attempts (do NOT repeat — try structurally different approaches):\n" for h in history: - history_section += f" Iteration {h.get('number', '?')}: {h.get('verdict', '?')} — {h.get('change_summary', '')}\n" + history_section += ( + f" Iteration {h.get('number', '?')}: {h.get('verdict', '?')} — {h.get('change_summary', '')}\n" + ) + + diversification_section = "" + if diversification_note: + diversification_section = f"\n\nSearch diversification instruction:\n{diversification_note}\n" protected_sections = extract_protected(current_content) protected_notice = "" @@ -141,7 +197,7 @@ def generate_variant( {current_content} -{failure_section}{history_section}{protected_notice} +{failure_section}{history_section}{diversification_section}{protected_notice} SAFETY RULES: 1. Do NOT delete sections without replacing them with equivalent or better content. @@ -180,31 +236,7 @@ def generate_variant( [why any removed section was replaced safely, or leave blank] """ - try: - response = client.messages.create( - model=model, - max_tokens=16000, - thinking={ - "type": "enabled", - "budget_tokens": 10000, - }, - messages=[{"role": "user", "content": prompt}], - ) - except anthropic.APIStatusError as e: - print(f"Error: API returned status {e.status_code}: {e.message}", file=sys.stderr) - sys.exit(1) - except anthropic.APIConnectionError as e: - print(f"Error: API connection failed: {e}", file=sys.stderr) - sys.exit(1) - - # Extract thinking and text - thinking_text = "" - text = "" - for block in response.content: - if block.type == "thinking": - thinking_text = block.thinking - elif block.type == "text": - text = block.text + text, raw_result_text, tokens_used = _run_claude_code(prompt, model) # Parse variant content variant_match = re.search(r"(.*?)", text, re.DOTALL) @@ -229,13 +261,11 @@ def generate_variant( if deletions: print(f"Warning: Deleted sections: {deletions}", file=sys.stderr) - tokens_used = response.usage.input_tokens + response.usage.output_tokens - return { "variant": variant, "summary": summary, "deletion_justification": deletion_justification, - "reasoning": thinking_text, + "reasoning": raw_result_text, "tokens_used": tokens_used, "deletions": deletions, } @@ -255,7 +285,8 @@ def main(): content_group.add_argument("--current-content-file", help="Path to a file containing the current content") parser.add_argument("--failures", default="[]", help="JSON list of failed tasks") parser.add_argument("--history", default="[]", help="JSON list of previous iterations") - parser.add_argument("--model", default="claude-sonnet-4-20250514", help="Model to use") + parser.add_argument("--diversification-note", default=None, help="Optional search diversification hint") + parser.add_argument("--model", default=None, help="Optional Claude Code model override") args = parser.parse_args() try: @@ -269,25 +300,20 @@ def main(): print(f"Error: --history is not valid JSON: {e}", file=sys.stderr) sys.exit(1) - if anthropic is None: - print("Error: anthropic SDK is not installed", file=sys.stderr) - sys.exit(1) - current_content = ( - open(args.current_content_file, encoding="utf-8").read() + Path(args.current_content_file).read_text(encoding="utf-8") if args.current_content_file else args.current_content ) - client = anthropic.Anthropic() result = generate_variant( - client=client, target_path=args.target, goal=args.goal, current_content=current_content, failures=failures, model=args.model, history=history if history else None, + diversification_note=args.diversification_note, ) print(json.dumps(result, indent=2)) diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index 228dd1d..dd17781 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -20,7 +20,9 @@ from __future__ import annotations import argparse +import glob import json +import os import random import re import subprocess @@ -89,6 +91,7 @@ def save_iteration( stop_reason: str | None = None, deletions: list[str] | None = None, deletion_justification: str = "", + metadata: dict | None = None, ) -> dict: """Save a full iteration snapshot and return its metadata.""" iter_dir = output_dir / f"{iteration:03d}" @@ -107,6 +110,8 @@ def save_iteration( "deletions": deletions or [], "deletion_justification": deletion_justification, } + if metadata: + verdict_data.update(metadata) (iter_dir / "verdict.json").write_text(json.dumps(verdict_data, indent=2)) if diff_text: @@ -148,6 +153,74 @@ def make_dry_run_variant(current_content: str, iteration: int) -> tuple[str, str return variant, "Synthetic dry-run mutation", "dry-run synthetic variant" +def _generate_variant_output( + current_content: str, + target_path: Path, + goal: str, + last_failures: list[dict], + history: list[dict], + model: str | None, + dry_run: bool, + iteration_number: int, + diversification_note: str | None = None, +) -> dict: + """Generate a candidate variant either synthetically or through Claude Code.""" + if dry_run: + variant_content, change_summary, reasoning = make_dry_run_variant(current_content, iteration_number) + return { + "variant": variant_content, + "summary": change_summary, + "reasoning": reasoning, + "tokens_used": 0, + "deletions": [], + "deletion_justification": "", + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=target_path.suffix, encoding="utf-8") as current_file: + current_file.write(current_content) + current_file.flush() + variant_cmd = [ + sys.executable, + str(Path(__file__).parent / "generate_variant.py"), + "--target", + str(target_path), + "--goal", + goal, + "--current-content-file", + current_file.name, + "--failures", + json.dumps(last_failures), + "--history", + json.dumps(history), + ] + if diversification_note: + variant_cmd.extend(["--diversification-note", diversification_note]) + if model: + variant_cmd.extend(["--model", model]) + _variant_project_root = Path.cwd() + for _parent in [_variant_project_root, *_variant_project_root.parents]: + if (_parent / ".claude").is_dir(): + _variant_project_root = _parent + break + _variant_env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + variant_result = subprocess.run( + variant_cmd, + capture_output=True, + text=True, + cwd=str(_variant_project_root), + env=_variant_env, + timeout=360, + ) + + if variant_result.returncode != 0: + raise RuntimeError(variant_result.stderr.strip() or "Variant generation failed") + + try: + return json.loads(variant_result.stdout) + except json.JSONDecodeError as e: + raise ValueError(f"Parse error: {e}") from e + + # --------------------------------------------------------------------------- # HTML report generation # --------------------------------------------------------------------------- @@ -204,9 +277,9 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: sc = it["score"] train_score = sc.get("train") test_score = sc.get("test") - score_str = f'{train_score:.2f}' if isinstance(train_score, (int, float)) else "?" + score_str = f"{train_score:.2f}" if isinstance(train_score, (int, float)) else "?" if isinstance(test_score, (int, float)): - score_str += f' / {test_score:.2f}' + score_str += f" / {test_score:.2f}" delta = str(it.get("delta", "")) dcls = "d-pos" if delta.startswith("+") and delta != "+0" else "d-neg" if delta.startswith("-") else "d-zero" summary = html_mod.escape(str(it.get("change_summary", ""))[:80]) @@ -227,10 +300,12 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
{diff_esc}
""" - chart_json = json.dumps([ - {"x": it["number"], "train": it["score"].get("train", 0), "test": it["score"].get("test")} - for it in iterations - ]) + chart_json = json.dumps( + [ + {"x": it["number"], "train": it["score"].get("train", 0), "test": it["score"].get("test")} + for it in iterations + ] + ) diffs_json = json.dumps({it["number"]: str(it.get("diff", "")) for it in iterations}) bt = baseline.get("train", 0.0) @@ -489,7 +564,7 @@ def _parse_frontmatter(content: str) -> tuple[bool, str]: while idx < len(fm_lines): line = fm_lines[idx] if line.startswith("description:"): - value = line[len("description:"):].strip() + value = line[len("description:") :].strip() if value in (">", "|", ">-", "|-"): parts: list[str] = [] idx += 1 @@ -509,9 +584,11 @@ def _is_trigger_task(task: dict) -> bool: def _is_pattern_task(task: dict) -> bool: - return "prompt" in task and ( - "expected_patterns" in task or "forbidden_patterns" in task or "weight" in task - ) + return "prompt" in task and ("expected_patterns" in task or "forbidden_patterns" in task or "weight" in task) + + +def _is_behavioral_task(task: dict) -> bool: + return "query" in task and "should_trigger" in task and task.get("eval_mode") == "behavioral" def _validate_task_set(tasks: list[dict]) -> None: @@ -521,9 +598,22 @@ def _validate_task_set(tasks: list[dict]) -> None: trigger_tasks = sum(1 for task in tasks if _is_trigger_task(task)) pattern_tasks = sum(1 for task in tasks if _is_pattern_task(task)) + behavioral_tasks = sum(1 for task in tasks if _is_behavioral_task(task)) + + # behavioral tasks are a subset of trigger tasks (same base fields), so subtract them + # to avoid double-counting when checking for pure trigger-rate sets + pure_trigger_tasks = trigger_tasks - behavioral_tasks + + if (pure_trigger_tasks or behavioral_tasks) and pattern_tasks: + raise ValueError( + "Task file mixes trigger-rate/behavioral and pattern benchmark formats. Use one format per run." + ) - if trigger_tasks and pattern_tasks: - raise ValueError("Task file mixes trigger-rate and pattern benchmark formats. Use one format per run.") + if behavioral_tasks and pure_trigger_tasks: + raise ValueError("Task file mixes trigger-rate and behavioral eval modes. Use one eval_mode per run.") + + if behavioral_tasks == len(tasks): + return if trigger_tasks == len(tasks): return @@ -534,9 +624,7 @@ def _validate_task_set(tasks: list[dict]) -> None: "Use trigger-rate tasks with 'query' and 'should_trigger' fields." ) - raise ValueError( - "Unsupported task format. Expected trigger-rate tasks with 'query' and 'should_trigger' fields." - ) + raise ValueError("Unsupported task format. Expected trigger-rate tasks with 'query' and 'should_trigger' fields.") # --------------------------------------------------------------------------- @@ -557,14 +645,11 @@ def _run_trigger_rate( Tasks must have 'query' and 'should_trigger' fields. Returns run_eval-style results dict. """ - import os - import tempfile - task_file = None try: with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(tasks, f) task_file = f.name + json.dump(tasks, f) with tempfile.TemporaryDirectory() as skill_dir: skill_md = Path(skill_dir) / "SKILL.md" @@ -577,13 +662,21 @@ def _run_trigger_rate( break cmd = [ - sys.executable, "-m", "scripts.skill_eval.run_eval", - "--eval-set", task_file, - "--skill-path", skill_dir, - "--description", description, - "--num-workers", str(num_workers), - "--timeout", str(timeout), - "--runs-per-query", "1", + sys.executable, + "-m", + "scripts.skill_eval.run_eval", + "--eval-set", + task_file, + "--skill-path", + skill_dir, + "--description", + description, + "--num-workers", + str(num_workers), + "--timeout", + str(timeout), + "--runs-per-query", + "1", ] if verbose: cmd.append("--verbose") @@ -592,21 +685,129 @@ def _run_trigger_rate( env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} result = subprocess.run( - cmd, capture_output=True, text=True, - cwd=str(project_root), env=env, timeout=600, + cmd, + capture_output=True, + text=True, + cwd=str(project_root), + env=env, + timeout=600, ) if result.returncode != 0: - if verbose: - print(f"Trigger assessment failed: {result.stderr[:300]}", file=sys.stderr) + print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr) return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} - return json.loads(result.stdout) + try: + return json.loads(result.stdout) + except json.JSONDecodeError as e: + print(f"Trigger assessment returned invalid JSON: {e} — stdout: {result.stdout[:200]}", file=sys.stderr) + return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} finally: if task_file: Path(task_file).unlink(missing_ok=True) +# --------------------------------------------------------------------------- +# Behavioral evaluator (runs claude -p and checks for artifact creation) +# --------------------------------------------------------------------------- + + +def _run_behavioral_eval( + target_path: Path, + description: str, + tasks: list[dict], + timeout: int = 240, + verbose: bool = False, +) -> list[dict]: + """Run behavioral assessment by invoking claude -p and checking artifact output. + + Each task must have 'query', 'should_trigger', 'artifact_glob', and optionally + 'query_prefix' fields. Tasks are run sequentially since each claude -p invocation + is resource-intensive. + + Returns a list of per-task result dicts with keys: + triggered, should_trigger, pass, new_artifacts + """ + project_root = Path.cwd() + for parent in [project_root, *project_root.parents]: + if (parent / ".claude").is_dir(): + project_root = parent + break + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + results = [] + for task in tasks: + query: str = task["query"] + should_trigger: bool = task["should_trigger"] + artifact_glob: str = task.get("artifact_glob", "adr/*.md") + query_prefix: str = task.get("query_prefix", "/do ") + + full_query = f"{query_prefix}{query}" + + # Snapshot existing artifacts before the run + before: set[str] = set(glob.glob(str(project_root / artifact_glob))) + + triggered = False + new_artifacts: list[str] = [] + + if verbose: + print(f"[behavioral] Running: claude -p {full_query!r}", file=sys.stderr) + + try: + result = subprocess.run( + ["claude", "-p", full_query], + capture_output=True, + text=True, + cwd=str(project_root), + env=env, + timeout=timeout, + ) + if result.returncode != 0: + print( + f"[behavioral] claude exited {result.returncode}: {result.stderr[:300]}", + file=sys.stderr, + ) + + # Check for new files matching the artifact glob + after: set[str] = set(glob.glob(str(project_root / artifact_glob))) + new_artifacts = sorted(after - before) + triggered = len(new_artifacts) > 0 + + if verbose and new_artifacts: + print(f"[behavioral] New artifacts: {new_artifacts}", file=sys.stderr) + + except subprocess.TimeoutExpired: + if verbose: + print(f"[behavioral] Timed out after {timeout}s for query: {full_query!r}", file=sys.stderr) + # Still check artifacts — the process may have written them before timing out + after_timeout: set[str] = set(glob.glob(str(project_root / artifact_glob))) + new_artifacts = sorted(after_timeout - before) + triggered = len(new_artifacts) > 0 + if verbose and triggered: + print(f"[behavioral] Artifacts found despite timeout: {new_artifacts}", file=sys.stderr) + + # Clean up artifacts so they don't pollute the before-snapshot of the next task + for artifact_path in new_artifacts: + try: + Path(artifact_path).unlink(missing_ok=True) + except OSError: + pass + + passed = triggered == should_trigger + results.append( + { + "query": query, + "triggered": triggered, + "should_trigger": should_trigger, + "pass": passed, + "new_artifacts": new_artifacts, + } + ) + + return results + + # --------------------------------------------------------------------------- # Evaluation bridge # --------------------------------------------------------------------------- @@ -654,6 +855,7 @@ def assess_target( # different variants produce different scores. if dry_run: import hashlib + h = int(hashlib.sha256(content.encode()).hexdigest()[:8], 16) base = (h % 30 + 70) / 100.0 # 0.70-1.00 range — always decent scores["correctness"] = round(base * 10, 2) @@ -664,14 +866,19 @@ def assess_target( scores["tests_pass"] = True # always pass in dry-run for task in tasks: name = task.get("name", task.get("query", "unnamed"))[:40] - scores["task_results"].append({ - "name": name, "passed": True, - "score": base, "details": "dry-run", - }) + scores["task_results"].append( + { + "name": name, + "passed": True, + "score": base, + "details": "dry-run", + } + ) return scores # Detect assessment mode from task format - is_trigger = all(_is_trigger_task(task) for task in tasks) + is_behavioral = all(_is_behavioral_task(task) for task in tasks) + is_trigger = not is_behavioral and all(_is_trigger_task(task) for task in tasks) if is_trigger: results = _run_trigger_rate(target_path, description, tasks, verbose=verbose) @@ -690,17 +897,46 @@ def assess_target( scores["tests_pass"] = passed == total for r in results.get("results", []): - scores["task_results"].append({ - "name": r.get("query", "unnamed")[:40], - "passed": r.get("pass", False), - "score": 1.0 if r.get("pass", False) else 0.0, - "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}", - }) + scores["task_results"].append( + { + "name": r.get("query", "unnamed")[:40], + "passed": r.get("pass", False), + "score": 1.0 if r.get("pass", False) else 0.0, + "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}", + } + ) + return scores + + if is_behavioral: + behavioral_results = _run_behavioral_eval(target_path, description, tasks, verbose=verbose) + total = len(behavioral_results) + passed = sum(1 for r in behavioral_results if r.get("pass", False)) + if total == 0: + return scores + + accuracy = passed / total + scores["correctness"] = round(accuracy * 10, 2) + scores["error_handling"] = round(accuracy * 8, 2) + scores["language_idioms"] = round(accuracy * 7, 2) + scores["testing"] = round(accuracy * 8, 2) + scores["efficiency"] = round(min(1.0, accuracy + 0.1) * 6, 2) + scores["tests_pass"] = passed == total + + for r in behavioral_results: + artifact_summary = ", ".join(r.get("new_artifacts", [])) or "none" + scores["task_results"].append( + { + "name": r.get("query", "unnamed")[:40], + "passed": r.get("pass", False), + "score": 1.0 if r.get("pass", False) else 0.0, + "details": f"triggered={r.get('triggered')}, artifacts={artifact_summary}", + } + ) return scores # Benchmark behavioral assessment — not yet implemented. - # Use trigger-rate format (tasks with 'query' + 'should_trigger') - # as the recommended starting point per ADR-131 research findings. + # Use trigger-rate tasks ('query' + 'should_trigger') or behavioral tasks + # ('query' + 'should_trigger' + 'eval_mode: behavioral') per ADR-132. raise NotImplementedError( "Pattern benchmark tasks are not yet implemented. " "Use trigger-rate tasks with 'query' and 'should_trigger' fields. " @@ -739,13 +975,26 @@ def run_optimization_loop( max_iterations: int = 20, min_gain: float = 0.02, train_split: float = 0.6, - model: str = "claude-sonnet-4-20250514", + revert_streak_limit: int = 5, + beam_width: int = 1, + candidates_per_parent: int = 1, + holdout_check_cadence: int = 5, + model: str | None = None, verbose: bool = False, report_path: Path | None = None, output_dir: Path | None = None, dry_run: bool = False, ) -> dict: """Run the autoresearch optimization loop.""" + if beam_width < 1: + raise ValueError("beam_width must be >= 1") + if candidates_per_parent < 1: + raise ValueError("candidates_per_parent must be >= 1") + if revert_streak_limit < 1: + raise ValueError("revert_streak_limit must be >= 1") + if holdout_check_cadence < 0: + raise ValueError("holdout_check_cadence must be >= 0") + if output_dir is None: output_dir = Path("evals/iterations") output_dir.mkdir(parents=True, exist_ok=True) @@ -764,7 +1013,6 @@ def run_optimization_loop( "Target must have YAML frontmatter with a non-empty description. " "optimize_loop.py currently supports frontmatter-description optimization only." ) - current_content = original_content target_label = target_path.name if verbose: @@ -773,7 +1021,7 @@ def run_optimization_loop( baseline_scores = assess_target(target_path, train_tasks, goal, verbose, dry_run) baseline_composite = composite_score(baseline_scores) best_score = baseline_composite - best_content = current_content + best_content = original_content best_iteration = 0 baseline_holdout_scores = assess_target(target_path, test_tasks, goal, verbose, dry_run) if test_tasks else None @@ -783,237 +1031,317 @@ def run_optimization_loop( holdout_display = f"{baseline_holdout:.4f}" if baseline_holdout is not None else "n/a" print(f"Baseline: train={baseline_composite:.4f}, holdout={holdout_display}", file=sys.stderr) + baseline_failures = [t for t in baseline_scores.get("task_results", []) if not t.get("passed")] + frontier = [ + { + "content": original_content, + "score": baseline_composite, + "iteration": 0, + "failures": baseline_failures, + "history": [], + } + ] + iterations: list[dict] = [] - consecutive_reverts = 0 + rounds_without_keep = 0 exit_reason = "unknown" status = "RUNNING" total_tokens = 0 + iteration_counter = 0 - for i in range(1, max_iterations + 1): + for round_number in range(1, max_iterations + 1): if verbose: - print(f"\n{'=' * 60}\nIteration {i}/{max_iterations} (best={best_score:.4f})", file=sys.stderr) - - # 1. Generate variant - t0 = time.time() - last_failures = [] - if iterations: - last_scores_data = iterations[-1].get("scores", {}) - last_failures = [t for t in last_scores_data.get("task_results", []) if not t.get("passed")] - history = [ - { - "number": item["number"], - "verdict": item["verdict"], - "change_summary": item["change_summary"], - "delta": item["delta"], - } - for item in iterations[-5:] - ] + print( + f"\n{'=' * 60}\nRound {round_number}/{max_iterations} " + f"(frontier={len(frontier)}, best={best_score:.4f})", + file=sys.stderr, + ) - if dry_run: - variant_content, change_summary, reasoning = make_dry_run_variant(current_content, i) - variant_output = { - "variant": variant_content, - "summary": change_summary, - "reasoning": reasoning, - "tokens_used": 0, - "deletions": [], - "deletion_justification": "", - } - deletions = [] - deletion_justification = "" - else: - with tempfile.NamedTemporaryFile(mode="w", suffix=target_path.suffix, encoding="utf-8") as current_file: - current_file.write(current_content) - current_file.flush() - variant_result = subprocess.run( - [ - sys.executable, - str(Path(__file__).parent / "generate_variant.py"), - "--target", str(target_path), - "--goal", goal, - "--current-content-file", current_file.name, - "--failures", json.dumps(last_failures), - "--history", json.dumps(history), - "--model", model, - ], - capture_output=True, text=True, timeout=120, + kept_nodes: list[dict] = [] + iteration_by_number: dict[int, dict] = {} + + for parent_index, parent in enumerate(frontier, start=1): + last_failures = parent.get("failures", []) + history = parent.get("history", [])[-5:] + + for candidate_index in range(1, candidates_per_parent + 1): + iteration_counter += 1 + t0 = time.time() + diversification_note = None + if beam_width > 1 or candidates_per_parent > 1 or len(frontier) > 1: + diversification_note = ( + f"Round {round_number}; parent {parent_index}/{len(frontier)} " + f"(source iteration {parent.get('iteration', 0)}); " + f"candidate {candidate_index}/{candidates_per_parent}. " + "Produce a materially different approach from sibling candidates in this round." + ) + + base_metadata = { + "round": round_number, + "parent_iteration": parent.get("iteration", 0), + "parent_rank": parent_index, + "candidate_index": candidate_index, + "frontier_rank": None, + "selected_for_frontier": False, + } + + try: + variant_output = _generate_variant_output( + current_content=parent["content"], + target_path=target_path, + goal=goal, + last_failures=last_failures, + history=history, + model=model, + dry_run=dry_run, + iteration_number=iteration_counter, + diversification_note=diversification_note, + ) + variant_content = variant_output["variant"] + change_summary = variant_output.get("summary", "") + reasoning = variant_output.get("reasoning", "") + total_tokens += variant_output.get("tokens_used", 0) + deletions = variant_output.get("deletions", []) + deletion_justification = variant_output.get("deletion_justification", "").strip() + except (RuntimeError, ValueError, KeyError, subprocess.TimeoutExpired) as e: + if verbose: + print(f"Variant generation failed: {e}", file=sys.stderr) + iteration_data = { + "number": iteration_counter, + "verdict": "REVERT", + "score": {"train": parent["score"], "test": None}, + "delta": "0", + "change_summary": str(e), + "reasoning": "", + "diff": "", + "tokens_used": 0, + "scores": {}, + **base_metadata, + } + save_iteration( + output_dir, + iteration_counter, + parent["content"], + {}, + "REVERT", + "", + "", + str(e), + metadata=base_metadata, + ) + iterations.append(iteration_data) + iteration_by_number[iteration_counter] = iteration_data + continue + + gen_elapsed = time.time() - t0 + diff_text = generate_diff(parent["content"], variant_content, target_label) + + if not check_protected_sections(original_content, variant_content): + if verbose: + print("REJECTED: Protected sections modified", file=sys.stderr) + iteration_data = { + "number": iteration_counter, + "verdict": "REVERT", + "score": {"train": 0.0, "test": None}, + "delta": "0", + "change_summary": "Protected sections modified", + "reasoning": reasoning, + "diff": diff_text, + "tokens_used": variant_output.get("tokens_used", 0), + "scores": {"protected_intact": False}, + **base_metadata, + } + save_iteration( + output_dir, + iteration_counter, + variant_content, + {"protected_intact": False}, + "REVERT", + "Protected sections modified", + diff_text, + change_summary, + metadata=base_metadata, + ) + iterations.append(iteration_data) + iteration_by_number[iteration_counter] = iteration_data + continue + + if deletions and not deletion_justification: + if verbose: + print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr) + iteration_data = { + "number": iteration_counter, + "verdict": "REVERT", + "score": {"train": parent["score"], "test": None}, + "delta": "0", + "change_summary": "Deleted sections without justification", + "reasoning": reasoning, + "diff": diff_text, + "tokens_used": variant_output.get("tokens_used", 0), + "scores": {"protected_intact": True}, + "deletions": deletions, + "deletion_justification": "", + **base_metadata, + } + save_iteration( + output_dir, + iteration_counter, + variant_content, + {"protected_intact": True}, + "REVERT", + "Deleted sections without justification", + diff_text, + change_summary, + deletions=deletions, + metadata=base_metadata, + ) + iterations.append(iteration_data) + iteration_by_number[iteration_counter] = iteration_data + continue + + temp_target = ( + target_path.parent / f".{target_path.stem}_variant_{iteration_counter}{target_path.suffix}" ) - - if variant_result.returncode != 0: + try: + temp_target.write_text(variant_content) + t0 = time.time() + variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run) + eval_elapsed = time.time() - t0 + variant_composite = composite_score(variant_scores) + finally: + temp_target.unlink(missing_ok=True) + + gain = variant_composite - parent["score"] if verbose: - print(f"Variant generation failed: {variant_result.stderr}", file=sys.stderr) - consecutive_reverts += 1 - iterations.append({ - "number": i, "verdict": "REVERT", - "score": {"train": best_score}, - "delta": "0", "change_summary": "Variant generation failed", - "reasoning": variant_result.stderr[:200], "diff": "", - }) - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" - status = "CONVERGED" - break - continue - - try: - variant_output = json.loads(variant_result.stdout) - variant_content = variant_output["variant"] - change_summary = variant_output.get("summary", "") - reasoning = variant_output.get("reasoning", "") - total_tokens += variant_output.get("tokens_used", 0) - deletions = variant_output.get("deletions", []) - deletion_justification = variant_output.get("deletion_justification", "").strip() - except (json.JSONDecodeError, KeyError) as e: - if verbose: - print(f"Parse error: {e}", file=sys.stderr) - consecutive_reverts += 1 - iterations.append({ - "number": i, "verdict": "REVERT", - "score": {"train": best_score}, - "delta": "0", "change_summary": f"Parse error: {e}", - "reasoning": "", "diff": "", - }) - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" - status = "CONVERGED" - break - continue - - gen_elapsed = time.time() - t0 - - # 2. Validate protected sections - if not check_protected_sections(original_content, variant_content): - if verbose: - print("REJECTED: Protected sections modified", file=sys.stderr) - diff_text = generate_diff(current_content, variant_content, target_label) - save_iteration(output_dir, i, variant_content, {"protected_intact": False}, - "REVERT", "Protected sections modified", diff_text, change_summary) - iterations.append({ - "number": i, "verdict": "REVERT", - "score": {"train": 0.0}, - "delta": "0", "change_summary": "Protected sections modified", - "reasoning": reasoning, "diff": diff_text, - }) - consecutive_reverts += 1 - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" - status = "CONVERGED" - break - continue - - if deletions and not deletion_justification: - if verbose: - print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr) - diff_text = generate_diff(current_content, variant_content, target_label) - save_iteration( - output_dir, - i, - variant_content, - {"protected_intact": True}, - "REVERT", - "Deleted sections without justification", - diff_text, - change_summary, - deletions=deletions, - ) - iterations.append({ - "number": i, - "verdict": "REVERT", - "score": {"train": best_score}, - "delta": "0", - "change_summary": "Deleted sections without justification", - "reasoning": reasoning, - "diff": diff_text, - "deletions": deletions, - "deletion_justification": "", - }) - consecutive_reverts += 1 - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" - status = "CONVERGED" - break - continue - - # 3. Evaluate variant - temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}" - temp_target.write_text(variant_content) - try: - t0 = time.time() - variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run) - eval_elapsed = time.time() - t0 - variant_composite = composite_score(variant_scores) - finally: - temp_target.unlink(missing_ok=True) - - diff_text = generate_diff(current_content, variant_content, target_label) + print( + f"Candidate {iteration_counter}: score={variant_composite:.4f} " + f"(vs parent {parent['score']:.4f}, gain={gain:+.4f}, " + f"gen={gen_elapsed:.1f}s, eval={eval_elapsed:.1f}s)", + file=sys.stderr, + ) + + verdict = "KEEP" if gain > min_gain else "REVERT" + if deletions and deletion_justification: + change_summary = f"{change_summary} [deletion justified]" + delta_str = f"{gain:+.2f}" if gain != 0 else "0" + + metadata = { + **base_metadata, + "gain_against_parent": round(gain, 4), + } + + save_iteration( + output_dir, + iteration_counter, + variant_content, + variant_scores, + verdict, + reasoning, + diff_text, + change_summary, + deletions=deletions, + deletion_justification=deletion_justification, + metadata=metadata, + ) - if verbose: - print(f"Score: {variant_composite:.4f} (gain={variant_composite - best_score:.4f}, gen={gen_elapsed:.1f}s, eval={eval_elapsed:.1f}s)", file=sys.stderr) - - # 4. Keep/revert (deterministic arithmetic) - gain = variant_composite - best_score - if gain > min_gain: - verdict = "KEEP" - best_score = variant_composite - best_content = variant_content - best_iteration = i - current_content = variant_content - consecutive_reverts = 0 - delta_str = f"+{gain:.2f}" + iteration_data = { + "number": iteration_counter, + "verdict": verdict, + "score": {"train": variant_composite, "test": None}, + "delta": delta_str, + "change_summary": change_summary, + "reasoning": reasoning, + "diff": diff_text, + "tokens_used": variant_output.get("tokens_used", 0), + "scores": variant_scores, + "deletions": deletions, + "deletion_justification": deletion_justification, + **metadata, + } + iterations.append(iteration_data) + iteration_by_number[iteration_counter] = iteration_data + + if verdict == "KEEP": + if variant_composite > best_score: + best_score = variant_composite + best_content = variant_content + best_iteration = iteration_counter + + kept_nodes.append( + { + "content": variant_content, + "score": variant_composite, + "iteration": iteration_counter, + "failures": [t for t in variant_scores.get("task_results", []) if not t.get("passed")], + "history": parent.get("history", []) + + [ + { + "number": iteration_counter, + "verdict": verdict, + "change_summary": change_summary, + "delta": delta_str, + } + ], + } + ) + + if kept_nodes: + kept_nodes.sort(key=lambda item: (-item["score"], item["iteration"])) + frontier = kept_nodes[:beam_width] + for rank, node in enumerate(frontier, start=1): + item = iteration_by_number.get(node["iteration"]) + if item is not None: + item["selected_for_frontier"] = True + item["frontier_rank"] = rank + rounds_without_keep = 0 else: - verdict = "REVERT" - consecutive_reverts += 1 - delta_str = f"{gain:+.2f}" if gain != 0 else "0" - - if deletions and deletion_justification: - change_summary = f"{change_summary} [deletion justified]" - - save_iteration(output_dir, i, variant_content, variant_scores, - verdict, reasoning, diff_text, change_summary, - deletions=deletions, deletion_justification=deletion_justification) - - iteration_data: dict = { - "number": i, "verdict": verdict, - "score": {"train": variant_composite, "test": None}, - "delta": delta_str, "change_summary": change_summary, - "reasoning": reasoning, "diff": diff_text, - "tokens_used": variant_output.get("tokens_used", 0), - "scores": variant_scores, - "deletions": deletions, - "deletion_justification": deletion_justification, - } + rounds_without_keep += 1 - # 5. Goodhart alarm — every 5 iterations, check held-out set - if test_tasks and i % 5 == 0: + if test_tasks and holdout_check_cadence > 0 and round_number % holdout_check_cadence == 0: + temp_target = target_path.parent / f".{target_path.stem}_holdout_check{target_path.suffix}" try: temp_target.write_text(best_content) holdout_scores = assess_target(temp_target, test_tasks, goal, verbose, dry_run) holdout_composite = composite_score(holdout_scores) - iteration_data["score"]["test"] = holdout_composite + if iterations: + iterations[-1]["score"]["test"] = holdout_composite finally: temp_target.unlink(missing_ok=True) if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite): if verbose: - print(f"GOODHART ALARM: holdout={holdout_composite:.4f} vs baseline={baseline_holdout:.4f}", file=sys.stderr) - exit_reason = f"goodhart_alarm (iteration {i})" + print( + f"GOODHART ALARM: holdout={holdout_composite:.4f} vs baseline={baseline_holdout:.4f}", + file=sys.stderr, + ) + exit_reason = f"goodhart_alarm (round {round_number})" status = "GOODHART_ALARM" - iterations.append(iteration_data) break - iterations.append(iteration_data) - - # 6. Convergence check - if consecutive_reverts >= 5: - exit_reason = f"converged (5 consecutive reverts at iteration {i})" + if rounds_without_keep >= revert_streak_limit: + exit_reason = f"converged ({revert_streak_limit} rounds without KEEP by round {round_number})" status = "CONVERGED" break - # Regenerate live report if report_path: - rd = _build_report_data(target_label, goal, baseline_composite, baseline_holdout, - len(train_tasks), len(test_tasks), iterations, max_iterations, - status, total_tokens) + rd = _build_report_data( + target_label, + goal, + baseline_composite, + baseline_holdout, + len(train_tasks), + len(test_tasks), + iterations, + max_iterations, + status, + total_tokens, + ) + rd["search"] = { + "strategy": "beam", + "beam_width": beam_width, + "candidates_per_parent": candidates_per_parent, + "holdout_check_cadence": holdout_check_cadence, + } report_path.write_text(generate_optimization_report(rd, auto_refresh=True)) else: @@ -1022,9 +1350,24 @@ def run_optimization_loop( # Final report if report_path: - rd = _build_report_data(target_label, goal, baseline_composite, baseline_holdout, - len(train_tasks), len(test_tasks), iterations, max_iterations, - status, total_tokens) + rd = _build_report_data( + target_label, + goal, + baseline_composite, + baseline_holdout, + len(train_tasks), + len(test_tasks), + iterations, + max_iterations, + status, + total_tokens, + ) + rd["search"] = { + "strategy": "beam", + "beam_width": beam_width, + "candidates_per_parent": candidates_per_parent, + "holdout_check_cadence": holdout_check_cadence, + } report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) if best_iteration > 0: @@ -1034,17 +1377,25 @@ def run_optimization_loop( print(f"\nBest variant saved to: {best_path}", file=sys.stderr) result = { - "exit_reason": exit_reason, "status": status, - "target": str(target_path), "goal": goal, + "exit_reason": exit_reason, + "status": status, + "target": str(target_path), + "goal": goal, "baseline_score": {"train": baseline_composite, "test": baseline_holdout}, "baseline_train_score": baseline_composite, "baseline_holdout_score": baseline_holdout, "best_score": best_score, - "best_iteration": best_iteration, "iterations_run": len(iterations), + "best_iteration": best_iteration, + "iterations_run": len(iterations), "max_iterations": max_iterations, "improvements_found": sum(1 for it in iterations if it["verdict"] == "KEEP"), "total_tokens": total_tokens, - "train_size": len(train_tasks), "test_size": len(test_tasks), + "search_strategy": "beam" if beam_width > 1 or candidates_per_parent > 1 else "hill_climb", + "beam_width": beam_width, + "candidates_per_parent": candidates_per_parent, + "holdout_check_cadence": holdout_check_cadence, + "train_size": len(train_tasks), + "test_size": len(test_tasks), "iterations": iterations, } (output_dir / "results.json").write_text(json.dumps(result, indent=2)) @@ -1061,12 +1412,38 @@ def main(): parser.add_argument("--target", required=True, help="Path to agent/skill file to optimize") parser.add_argument("--goal", required=True, help="Optimization objective") parser.add_argument("--benchmark-tasks", required=True, help="Path to benchmark tasks JSON") - parser.add_argument("--max-iterations", type=int, default=20, help="Max iterations (default: 20)") + parser.add_argument( + "--max-iterations", + type=int, + default=20, + help="Max optimization rounds (default: 20); each round evaluates up to beam_width x candidates_per_parent candidates", + ) parser.add_argument("--min-gain", type=float, default=0.02, help="Min score gain to keep (default: 0.02)") parser.add_argument("--train-split", type=float, default=0.6, help="Train fraction (default: 0.6)") - parser.add_argument("--model", required=True, help="Model for variant generation") + parser.add_argument( + "--revert-streak-limit", + type=int, + default=5, + help="Stop after this many rounds without any KEEP candidates (default: 5)", + ) + parser.add_argument("--beam-width", type=int, default=1, help="Number of kept candidates to retain per round") + parser.add_argument( + "--candidates-per-parent", + type=int, + default=1, + help="How many sibling variants to generate from each frontier candidate per round", + ) + parser.add_argument( + "--holdout-check-cadence", + type=int, + default=5, + help="Check held-out tasks every N rounds (default: 5; 0 disables)", + ) + parser.add_argument("--model", default=None, help="Optional Claude Code model override for variant generation") parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") - parser.add_argument("--dry-run", action="store_true", help="Use synthetic scores (test loop mechanics without API)") + parser.add_argument( + "--dry-run", action="store_true", help="Use synthetic scores (test loop mechanics without calling Claude Code)" + ) parser.add_argument("--report", default=None, help="Path for live HTML report") parser.add_argument("--output-dir", default=None, help="Directory for iteration snapshots") args = parser.parse_args() @@ -1083,10 +1460,17 @@ def main(): try: result = run_optimization_loop( - target_path=target, goal=args.goal, + target_path=target, + goal=args.goal, benchmark_tasks_path=tasks_path, - max_iterations=args.max_iterations, min_gain=args.min_gain, - train_split=args.train_split, model=args.model, + max_iterations=args.max_iterations, + min_gain=args.min_gain, + train_split=args.train_split, + revert_streak_limit=args.revert_streak_limit, + beam_width=args.beam_width, + candidates_per_parent=args.candidates_per_parent, + holdout_check_cadence=args.holdout_check_cadence, + model=args.model, verbose=args.verbose, report_path=Path(args.report) if args.report else None, output_dir=Path(args.output_dir) if args.output_dir else None, diff --git a/skills/do/SKILL.md b/skills/do/SKILL.md index 4f68b1a..978e670 100644 --- a/skills/do/SKILL.md +++ b/skills/do/SKILL.md @@ -69,7 +69,24 @@ Read and follow the repository CLAUDE.md before making any routing decision, bec **Optional: Force Direct** — OFF by default. When explicitly enabled, overrides routing for trivial operations. Only applies when the user explicitly requests it. -**Gate**: Complexity classified. Display routing banner (ALL classifications). If not Trivial, proceed to Phase 2. If Trivial, handle directly after showing banner. +**Creation Request Detection** (MANDATORY scan before Gate): + +Scan the request for creation signals before completing Phase 1: +- Explicit creation verbs: "create", "scaffold", "build", "add new", "new [component]", "implement new" +- Domain object targets: agent, skill, pipeline, hook, feature, plugin, workflow, voice profile +- Implicit creation: "I need a [component]", "we need a [component]", "build me a [component]" + +If ANY creation signal is found AND complexity is Simple+: +1. Set an internal flag: `is_creation = true` +2. **Phase 4 Step 0 is MANDATORY** — write ADR before dispatching any agent + +This early detection exists because Phase 4 Step 0 is the most frequently skipped step in /do. Moving detection to Phase 1 ensures the creation protocol fires before routing decisions consume attention. The Gate below enforces acknowledgment before Phase 2. + +**Not a creation request**: debugging, reviewing, fixing, refactoring, explaining, running, checking, auditing existing components. When ambiguous, check whether the output would be a NEW file that doesn't yet exist. + +**Gate**: Complexity classified. If a creation signal was detected, output `[CREATION REQUEST DETECTED]` before displaying the routing banner. Display routing banner (ALL classifications). If not Trivial, proceed to Phase 2. If Trivial, handle directly after showing banner. + + --- @@ -265,3 +282,5 @@ Solution: Stop execution. Create `task_plan.md`. Resume routing after plan is in - `agents/INDEX.json`: Agent triggers and metadata - `skills/INDEX.json`: Skill triggers, force-route flags, pairs_with - `pipelines/INDEX.json`: Pipeline phases, triggers, composition chains + + diff --git a/skills/skill-creator/assets/eval_viewer.html b/skills/skill-creator/assets/eval_viewer.html index 48822b6..81243e0 100644 --- a/skills/skill-creator/assets/eval_viewer.html +++ b/skills/skill-creator/assets/eval_viewer.html @@ -90,6 +90,12 @@ transition: color 0.3s, border-color 0.3s, background 0.3s; } +.blind-notice.optimization { + color: var(--accent); + border-color: rgba(77, 142, 245, 0.35); + background: var(--accent-dim); +} + /* ─── Navigation ────────────────────────────────────────────────────── */ .nav { display: flex; @@ -707,6 +713,7 @@ display: flex; gap: 10px; align-items: center; + flex-wrap: wrap; } .opt-btn { @@ -768,15 +775,20 @@

Blind A/B Code Review

// State const reviews = {}; +function setActivePage(pageName){ + document.querySelectorAll('.nav-btn').forEach(b=>b.classList.remove('active')); + document.querySelectorAll('.page').forEach(p=>p.classList.remove('active')); + const btn = document.querySelector(`.nav-btn[data-page="${pageName}"]`); + const page = document.getElementById(pageName+'-page'); + if(btn) btn.classList.add('active'); + if(page) page.classList.add('active'); + if(pageName==='iterations' && typeof window.drawOptChart==='function') window.drawOptChart(); +} + // Nav document.querySelectorAll('.nav-btn').forEach(btn => { btn.addEventListener('click', function(){ - document.querySelectorAll('.nav-btn').forEach(b=>b.classList.remove('active')); - document.querySelectorAll('.page').forEach(p=>p.classList.remove('active')); - this.classList.add('active'); - document.getElementById(this.dataset.page+'-page').classList.add('active'); - // Re-render iterations chart when tab becomes visible (clientWidth is 0 when hidden) - if(this.dataset.page==='iterations' && typeof window.drawOptChart==='function') window.drawOptChart(); + setActivePage(this.dataset.page); }); }); @@ -884,7 +896,7 @@

Blind A/B Code Review

function buildReviewPage(){ const page=document.getElementById('review-page'); if(!DATA||!DATA.evals||!DATA.evals.length){ - page.appendChild(el('p','stat-name','No data. Run generate_comparison.py to populate.')); + page.appendChild(el('p','stat-name','No blind-comparison evals in this workspace.')); return; } const tabs=el('div','eval-tabs'); @@ -1289,6 +1301,8 @@

Blind A/B Code Review

if(iterBtn) iterBtn.style.display = 'none'; return; } + var iterBtn = document.querySelector('[data-page="iterations"]'); + if(iterBtn) iterBtn.style.display = ''; var opt = DATA.optimization; var iterations = opt.iterations || []; @@ -1343,8 +1357,9 @@

Blind A/B Code Review

function drawOptChart() { var area = document.getElementById('opt-chart-area'); if(!iterations.length) { area.textContent = 'No iterations yet'; return; } + if(area.clientWidth <= 32) return; - var W = Math.min(area.clientWidth - 16, 800), H = 200; + var W = Math.max(280, Math.min(area.clientWidth - 16, 800)), H = 200; var pad = {l:40, r:20, t:10, b:30}; var pW = W-pad.l-pad.r, pH = H-pad.t-pad.b; var pts = iterations.map(function(it){ @@ -1434,6 +1449,8 @@

Blind A/B Code Review

thead.appendChild(hrow);tbl.appendChild(thead); var tbody = document.createElement('tbody'); + var keepNumbers = iterations.filter(function(it){return it.verdict==='KEEP';}).map(function(it){return it.number;}); + var defaultPick = keepNumbers.length ? keepNumbers[keepNumbers.length-1] : null; iterations.forEach(function(it){ var tr = document.createElement('tr'); tr.className = 'iter-row'; @@ -1460,10 +1477,11 @@

Blind A/B Code Review

var pickTd = document.createElement('td'); var pickLabel = document.createElement('label'); var pickCb = document.createElement('input'); - pickCb.type = 'checkbox'; - pickCb.className = 'cherry-pick-cb'; + pickCb.type = 'radio'; + pickCb.name = 'selected-iteration'; + pickCb.className = 'snapshot-pick'; pickCb.dataset.iteration = String(it.number); - if(it.verdict==='KEEP') pickCb.checked = true; + if(it.verdict==='KEEP' && it.number===defaultPick) pickCb.checked = true; if(it.verdict!=='KEEP') pickCb.disabled = true; pickLabel.appendChild(pickCb); pickLabel.appendChild(document.createTextNode(' Pick')); @@ -1498,34 +1516,35 @@

Blind A/B Code Review

// Actions bar var actions = el('div','opt-actions'); - var previewBtn = el('button','opt-btn opt-btn-primary','Preview Combined'); + var previewBtn = el('button','opt-btn opt-btn-primary','Preview Selected Snapshot'); + if(!keepNumbers.length) previewBtn.disabled = true; previewBtn.addEventListener('click', function(){ - var selected = Array.from(document.querySelectorAll('#iterations-page .cherry-pick-cb:checked')) - .map(function(cb){return parseInt(cb.dataset.iteration);}); - if(!selected.length){alert('No iterations selected');return;} - var combined = selected.map(function(n){ - var it = iterations.find(function(i){return i.number===n;}); - return '--- Iteration '+n+' ---\n'+(it&&it.diff||'(no diff)'); - }).join('\n\n'); + var selected = document.querySelector('#iterations-page .snapshot-pick:checked'); + if(!selected){alert('Select one kept iteration');return;} + var n = parseInt(selected.dataset.iteration); + var it = iterations.find(function(i){return i.number===n;}); + var combined = '--- Iteration '+n+' ---\n'+((it&& (it.full_diff||it.diff)) || '(no diff)'); var preview = document.getElementById('opt-preview-content'); preview.textContent = combined; document.getElementById('opt-preview-area').classList.remove('hidden'); }); actions.appendChild(previewBtn); - var exportBtn = el('button','opt-btn','Export Selected'); + var exportBtn = el('button','opt-btn','Export Selected Snapshot'); + if(!keepNumbers.length) exportBtn.disabled = true; exportBtn.addEventListener('click', function(){ - var selected = Array.from(document.querySelectorAll('#iterations-page .cherry-pick-cb:checked')) - .map(function(cb){return parseInt(cb.dataset.iteration);}); - if(!selected.length){alert('No iterations selected');return;} - var output = {selected_iterations:selected, diffs:{}}; - selected.forEach(function(n){ - var it = iterations.find(function(i){return i.number===n;}); - output.diffs[String(n)] = it&&it.diff||''; - }); + var selected = document.querySelector('#iterations-page .snapshot-pick:checked'); + if(!selected){alert('Select one kept iteration');return;} + var n = parseInt(selected.dataset.iteration); + var it = iterations.find(function(i){return i.number===n;}); + var output = { + selected_iteration:n, + full_diff:(it&&(it.full_diff||it.diff))||'', + diff_type:'full_diff_from_original' + }; var blob = new Blob([JSON.stringify(output,null,2)],{type:'application/json'}); var url = URL.createObjectURL(blob); - var a = document.createElement('a');a.href=url;a.download='cherry-picked-iterations.json';a.click(); + var a = document.createElement('a');a.href=url;a.download='selected-iteration.json';a.click(); URL.revokeObjectURL(url); }); actions.appendChild(exportBtn); @@ -1536,13 +1555,18 @@

Blind A/B Code Review

var previewArea = el('div','hidden'); previewArea.id = 'opt-preview-area'; previewArea.style.marginTop = '16px'; - previewArea.appendChild(el('h3','','Combined Preview')); + previewArea.appendChild(el('h3','','Selected Snapshot Preview')); previewArea.lastChild.style.cssText = 'color:var(--bright);margin-bottom:8px;font-size:14px'; var previewPre = el('pre','iter-diff-block',''); previewPre.id = 'opt-preview-content'; previewArea.appendChild(previewPre); page.appendChild(previewArea); + var snapshotNote = el('p','muted','Selection exports the full diff from the original target to one kept snapshot.'); + if(!keepNumbers.length) snapshotNote.textContent = 'No kept snapshot yet. Preview and export unlock after the first KEEP verdict.'; + snapshotNote.style.marginTop = '12px'; + page.appendChild(snapshotNote); + // Summary stats card if(opt.summary || iterations.length) { var statsCard = el('div','card'); @@ -1583,6 +1607,9 @@

Blind A/B Code Review

buildStat(goalBody, 'Target', opt.target||'?'); buildStat(goalBody, 'Goal', opt.goal); buildStat(goalBody, 'Train/Test Split', String(taskCounts.train||opt.train_size||'?')+'/'+String(taskCounts.test||opt.test_size||'?')); + if(opt.beam_width || opt.candidates_per_parent) { + buildStat(goalBody, 'Search', 'beam=' + String(opt.beam_width||1) + ', siblings=' + String(opt.candidates_per_parent||1)); + } goalCard.appendChild(goalBody); page.appendChild(goalCard); } @@ -1592,6 +1619,21 @@

Blind A/B Code Review

buildGradesPage(); buildResultsPage(); buildIterationsPage(); + +if(DATA && DATA.optimization && (!DATA.evals || !DATA.evals.length)) { + var title = document.querySelector('.header h1'); + var badge = document.querySelector('.blind-notice'); + if(title) title.textContent = 'Optimization Loop Review'; + if(badge) { + badge.textContent = (DATA.optimization.status || 'Snapshot Review').replace('_', ' '); + badge.classList.add('optimization'); + } + ['review','grades','results'].forEach(function(name){ + var btn = document.querySelector('.nav-btn[data-page="'+name+'"]'); + if(btn) btn.style.display = 'none'; + }); + setActivePage('iterations'); +} diff --git a/skills/skill-creator/scripts/eval_compare.py b/skills/skill-creator/scripts/eval_compare.py index 552100b..9a53e25 100644 --- a/skills/skill-creator/scripts/eval_compare.py +++ b/skills/skill-creator/scripts/eval_compare.py @@ -162,11 +162,22 @@ def find_iteration_dirs(workspace: Path) -> list[Path]: return [d for d in dirs if d.is_dir()] +def is_optimization_data(data: object) -> bool: + """Return True when the payload matches optimize_loop.py results.""" + if not isinstance(data, dict): + return False + iterations = data.get("iterations") + if not isinstance(iterations, list): + return False + if "baseline_score" not in data: + return False + if "target" not in data: + return False + return all(isinstance(item, dict) and "number" in item and "verdict" in item for item in iterations) + + def load_optimization_data(workspace: Path) -> dict | None: """Load optimization loop results when present in the workspace.""" - def looks_like_optimization_results(data: dict) -> bool: - return isinstance(data, dict) and "iterations" in data and "baseline_score" in data and "target" in data - candidates = [ workspace / "results.json", workspace / "evals" / "iterations" / "results.json", @@ -175,7 +186,7 @@ def looks_like_optimization_results(data: dict) -> bool: for path in candidates: if path.exists(): data = load_json_safe(path) - if data is not None and looks_like_optimization_results(data): + if is_optimization_data(data): return data return None diff --git a/skills/skill-eval/SKILL.md b/skills/skill-eval/SKILL.md index 8e5214f..ba89f3b 100644 --- a/skills/skill-eval/SKILL.md +++ b/skills/skill-eval/SKILL.md @@ -109,7 +109,6 @@ Automated loop that tests, improves, and re-tests descriptions using Claude with python3 -m scripts.skill_eval.run_loop \ --eval-set evals.json \ --skill-path \ - --model claude-opus-4-6 \ --max-iterations 5 \ --verbose ``` @@ -117,7 +116,7 @@ python3 -m scripts.skill_eval.run_loop \ This will: 1. Split eval set 60/40 train/test (stratified by should_trigger) — prevents overfitting to test cases 2. Evaluate current description on all queries (3 runs each for reliability) -3. Use Claude with extended thinking to propose improvements based on training failures +3. Use `claude -p` to propose improvements based on training failures 4. Re-evaluate the new description 5. Repeat until all pass or max iterations reached 6. Select best description by **test** score (not train score — prevents overfitting) @@ -207,9 +206,9 @@ If description optimization found a better description: **Cause**: Claude CLI not available for trigger evaluation **Solution**: Install Claude Code CLI. Trigger eval requires `claude -p` to test skill invocation. -### Error: "anthropic SDK not installed" -**Cause**: Description optimization requires the Anthropic Python SDK -**Solution**: `pip install anthropic`. Only needed for `improve_description.py` and `run_loop.py`. +### Error: "legacy SDK dependency" +**Cause**: Outdated instructions or an old checkout still expects a direct SDK client +**Solution**: Update to the current scripts. Description optimization now runs through `claude -p`. ### Error: "CLAUDECODE environment variable" **Cause**: Running eval from inside a Claude Code session blocks nested instances