diff --git a/pipelines/comprehensive-review/SKILL.md b/pipelines/comprehensive-review/SKILL.md index 8ba22ea..9ea2a61 100644 --- a/pipelines/comprehensive-review/SKILL.md +++ b/pipelines/comprehensive-review/SKILL.md @@ -1,23 +1,16 @@ --- name: comprehensive-review description: | - Unified 4-wave code review: Wave 0 auto-discovers packages/modules and - dispatches one language-specialist agent per package for deep per-package - analysis. Wave 1 dispatches 12 foundation reviewers in parallel (with Wave 0 - context). Wave 2 dispatches 10 deep-dive reviewers that receive Wave 0+1 - findings as context for targeted analysis. Wave 3 dispatches 4-5 adversarial - reviewers that challenge Wave 1+2 consensus — contrarian, skeptical senior, - user advocate, meta-process, and conditionally SAPCC structural. Aggregates - all findings by severity with wave-agreement labels (unanimous, majority, - contested), then auto-fixes ALL issues. Covers per-package deep review, - security, business logic, architecture, error handling, test coverage, type - design, code quality, comment analysis, language idioms, docs validation, - newcomer perspective, performance, concurrency, API contracts, dependencies, - error messages, dead code, naming, observability, config safety, migration - safety, and adversarial challenge. - Use for "comprehensive review", "full review", "review everything", "review - and fix", or "thorough code review". - Do NOT use for single-concern reviews (use individual agents instead). + Four-wave code review pipeline for large or high-risk changes. Wave 0 + auto-discovers packages/modules and runs per-package specialist review. Wave + 1 runs broad foundation reviewers in parallel. Wave 2 runs targeted deep-dive + reviewers using earlier findings as context. Wave 3 runs adversarial reviewers + that challenge consensus and surface missed risks. 
REPO_ROOT = Path(__file__).resolve().parents[2]


def load_module(name: str, relative_path: str):
    """Import a repository script by file path and return the executed module.

    The scripts under test sit below hyphenated directories (for example
    ``skills/agent-comparison/scripts``), so they are loaded from their file
    location instead of through a normal ``import`` statement.

    Args:
        name: Module name to give the loaded module.
        relative_path: Script path relative to ``REPO_ROOT``.

    Returns:
        The module object after its top-level code has run.

    Raises:
        ImportError: If no import spec or loader can be built for the path.
    """
    module_path = REPO_ROOT / relative_path
    spec = importlib.util.spec_from_file_location(name, module_path)
    # spec_from_file_location may return None; fail with a readable message
    # instead of an AttributeError on ``spec.loader``.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_assess_target_rejects_missing_frontmatter(tmp_path):
    """A target without YAML frontmatter fails ``parses`` and scores 0.0."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )
    target = tmp_path / "SKILL.md"
    target.write_text("# no frontmatter\nbody\n")

    scores = optimize_loop.assess_target(
        target,
        [{"query": "write tests", "should_trigger": True}],
        "improve routing precision",
        dry_run=True,
    )

    assert scores["parses"] is False
    assert optimize_loop.composite_score(scores) == 0.0


def test_check_protected_sections_rejects_missing_blocks():
    """Dropping a protected block from the variant must fail the check."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )
    # NOTE(review): the two marker lines render as empty strings in the reviewed
    # copy (they look like stripped HTML comments). Reconstructed from the
    # "DO NOT OPTIMIZE markers" wording in the docs; confirm the exact spelling.
    original = (
        "alpha\n"
        "<!-- DO NOT OPTIMIZE -->\n"
        "keep me\n"
        "<!-- END DO NOT OPTIMIZE -->\n"
        "omega\n"
    )
    relocated = "alpha\nomega\n"

    assert optimize_loop.check_protected_sections(original, relocated) is False


def test_restore_protected_does_not_silently_reinsert_missing_blocks():
    """A variant that lost its protected block is returned unchanged."""
    generate_variant = load_module(
        "agent_comparison_generate_variant",
        "skills/agent-comparison/scripts/generate_variant.py",
    )
    # NOTE(review): marker lines reconstructed, see the note in
    # test_check_protected_sections_rejects_missing_blocks.
    original = (
        "alpha\n"
        "<!-- DO NOT OPTIMIZE -->\n"
        "keep me\n"
        "<!-- END DO NOT OPTIMIZE -->\n"
        "omega\n"
    )
    variant = "alpha\nomega\n"

    restored = generate_variant.restore_protected(original, variant)

    assert restored == variant


def test_generate_variant_main_reads_current_content_from_file(tmp_path, monkeypatch, capsys):
    """``main`` accepts ``--current-content-file`` and prints the parsed variant as JSON."""
    generate_variant = load_module(
        "agent_comparison_generate_variant",
        "skills/agent-comparison/scripts/generate_variant.py",
    )

    class FakeBlock:
        """Stand-in for one response content block (thinking or text)."""

        def __init__(self, block_type: str, text: str):
            self.type = block_type
            if block_type == "thinking":
                self.thinking = text
            else:
                self.text = text

    class FakeResponse:
        """Canned API response: one thinking block, one tagged text block."""

        def __init__(self):
            # NOTE(review): the <variant>/<summary> tags are reconstructed; the
            # reviewed copy shows the bare payloads, which could not satisfy
            # the assertions below. Confirm against the repository.
            self.content = [
                FakeBlock("thinking", "reasoning"),
                FakeBlock(
                    "text",
                    "<variant>---\ndescription: updated\n---</variant>"
                    "<summary>updated</summary>",
                ),
            ]
            self.usage = type("Usage", (), {"input_tokens": 1, "output_tokens": 2})()

    class FakeClient:
        """Client whose ``messages.create`` always returns ``FakeResponse``."""

        def __init__(self):
            self.messages = type("Messages", (), {"create": lambda self, **kwargs: FakeResponse()})()

    class FakeAnthropicModule:
        """Replacement for the ``anthropic`` module attribute used by ``main``."""

        class Anthropic:
            # ``anthropic.Anthropic()`` hands back the fake client directly.
            def __new__(cls):
                return FakeClient()

    content_file = tmp_path / "current.md"
    content_file.write_text("---\ndescription: current\n---\n")

    monkeypatch.setattr(generate_variant, "anthropic", FakeAnthropicModule)
    monkeypatch.setattr(
        sys,
        "argv",
        [
            "generate_variant.py",
            "--target",
            "skills/example/SKILL.md",
            "--goal",
            "improve routing precision",
            "--current-content-file",
            str(content_file),
            "--model",
            "fake-model",
        ],
    )

    generate_variant.main()
    output = json.loads(capsys.readouterr().out)

    assert output["variant"] == "---\ndescription: updated\n---"
    # 1 input token + 2 output tokens from FakeResponse.usage.
    assert output["tokens_used"] == 3
REPO_ROOT = Path(__file__).resolve().parents[2]


def load_module(name: str, relative_path: str):
    """Import a repository script by file path and return the executed module.

    The script under test sits below a hyphenated directory
    (``skills/skill-creator/scripts``), so it is loaded from its file location
    instead of through a normal ``import`` statement.

    Args:
        name: Module name to give the loaded module.
        relative_path: Script path relative to ``REPO_ROOT``.

    Returns:
        The module object after its top-level code has run.

    Raises:
        ImportError: If no import spec or loader can be built for the path.
    """
    module_path = REPO_ROOT / relative_path
    spec = importlib.util.spec_from_file_location(name, module_path)
    # spec_from_file_location may return None; fail with a readable message
    # instead of an AttributeError on ``spec.loader``.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_load_optimization_data_ignores_unrelated_results_json(tmp_path):
    """The loader must return the ``evals/iterations`` payload, not a stray top-level file."""
    eval_compare = load_module(
        "skill_creator_eval_compare",
        "skills/skill-creator/scripts/eval_compare.py",
    )
    # Decoy: a results.json in the workspace root that is not optimization data.
    (tmp_path / "results.json").write_text(json.dumps({"status": "not-optimization"}))
    (tmp_path / "evals" / "iterations").mkdir(parents=True)
    expected = {
        "target": "skills/example/SKILL.md",
        "baseline_score": {"train": 1.0, "test": 1.0},
        "iterations": [],
    }
    (tmp_path / "evals" / "iterations" / "results.json").write_text(json.dumps(expected))

    loaded = eval_compare.load_optimization_data(tmp_path)

    assert loaded == expected
Also supports automated optimization + loops (autoresearch) for frontmatter description and routing-trigger quality + using train/test eval sets. Use when creating compact agent versions, validating + agent changes, comparing internal vs external agents, optimizing a skill description, + or deciding between variants for production. Use for "compare agents", "A/B test", + "benchmark agents", "test agent efficiency", "optimize description", "optimize skill", + or "run autoresearch". Route single-agent evaluation to agent-evaluation, testing skills, + or simple prompt optimization without variant comparison. +version: 2.2.0 user-invocable: false allowed-tools: - Read @@ -22,6 +25,9 @@ routing: - "compare agents" - "A/B test agents" - "benchmark agents" + - "optimize skill" + - "optimize description" + - "run autoresearch" category: meta-tooling --- @@ -254,6 +260,129 @@ Remove temporary benchmark files and debug outputs. Keep only the comparison rep **Gate**: Report generated with all metrics. Verdict stated with evidence. Report saved to benchmark directory. +### Phase 5: OPTIMIZE (optional — invoked explicitly) + +**Goal**: Run an automated optimization loop that iteratively improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then keeps only measured improvements. + +This phase is for routing/trigger optimization, not full code-generation benchmarking. Invoke it when the user says "optimize this skill", "optimize the description", or "run autoresearch". The existing manual A/B comparison (Phases 1-4) remains the path for full agent benchmarking. 
+ +**Step 1: Validate optimization target and goal** + +Confirm the target file exists, has YAML frontmatter with a `description`, and the optimization goal is clear: + +```bash +# Target must be a markdown file with frontmatter description +test -f skills/{target}/SKILL.md +rg -n '^description:' skills/{target}/SKILL.md + +# Goal should be specific and measurable +# Good: "improve routing precision without losing recall" +# Bad: "make it better" +``` + +**Step 2: Prepare trigger-rate eval tasks** + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/{target}/SKILL.md \ + --goal "{optimization goal}" \ + --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ + --train-split 0.6 \ + --model claude-sonnet-4-20250514 \ + --verbose +``` + +Supported task schemas: +- Flat `tasks` list with optional `"split": "train" | "test"` per task +- Top-level `train` and `test` arrays + +Every task must include: +- `query`: the routing prompt to test +- `should_trigger`: whether the target should trigger for that prompt + +If no split markers are present, the loop does a reproducible random split with seed `42`. + +**Step 3: Run baseline evaluation** + +The loop automatically evaluates the unmodified target against the train set before starting iteration. This establishes the score to beat, and records a held-out baseline if test tasks exist. 
+ +**Step 4: Enter optimization loop** + +The `optimize_loop.py` script handles the full loop: +- Calls `generate_variant.py` to propose changes (Claude with extended thinking) +- Evaluates each variant against train tasks +- Keeps variants that improve score by more than `--min-gain` (default 0.02) +- Reverts variants that don't improve, break hard gates, or delete sections without justification +- Checks held-out test set every 5 iterations for Goodhart divergence +- Stops on convergence (5 consecutive reverts), Goodhart alarm, or max iterations + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/{target}/SKILL.md \ + --goal "{optimization goal}" \ + --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ + --max-iterations 20 \ + --min-gain 0.02 \ + --train-split 0.6 \ + --model claude-sonnet-4-20250514 \ + --report optimization-report.html \ + --output-dir evals/iterations \ + --verbose +``` + +The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and cherry-pick controls. + +**Step 5: Present results in UI** + +Open the generated `optimization-report.html` in a browser. The report shows: +- Progress dashboard (status, baseline vs best, kept/reverted counts) +- Convergence chart (train solid line, held-out dashed line, baseline dotted) +- Iteration table with verdict, composite score, delta, and change summary +- Expandable inline diffs per iteration (click any row) + +**Step 6: User cherry-picks improvements** + +Not all KEEP iterations are real improvements — some may be harness artifacts. 
The user reviews each kept iteration's diff and selects which to include: +- Check the "Pick" checkbox for desired iterations +- Click "Preview Combined" to see the merged diff +- Click "Export Selected" to download a JSON file with chosen diffs + +**Step 7: Apply selected improvements to target file** + +Apply the selected improvements to the original target file. + +- If you want the best single kept variant, use `evals/iterations/best_variant.md`. +- If you exported selected diffs, treat that JSON as review material for a manual follow-up apply step. It is not auto-applied by the current tooling. + +```bash +# Review the best kept variant before applying +cat evals/iterations/best_variant.md | head -20 + +# Replace the target with the best kept variant +cp evals/iterations/best_variant.md skills/{target}/SKILL.md +``` + +**Step 8: Run final evaluation on FULL task set (train + test)** + +After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize: + +```bash +# Re-run optimize_loop.py against the same task file and inspect results.json/report output +``` + +Compare final scores to the baseline to confirm net improvement. + +**Step 9: Record in learning-db** + +```bash +python3 scripts/learning-db.py learn \ + --skill agent-comparison \ + "autoresearch: {target} improved {baseline}→{best} over {iterations} iterations. \ + Kept: {kept}/{total}. Stop: {reason}. Changes: {summaries}" +``` + +**Gate**: Optimization complete. Results reviewed. Cherry-picked improvements applied and verified against full task set. Results recorded. + ### Optional Extensions These are off by default. 
Enable explicitly when needed: diff --git a/skills/agent-comparison/references/benchmark-tasks.md b/skills/agent-comparison/references/benchmark-tasks.md index c3f0d4b..f6142d1 100644 --- a/skills/agent-comparison/references/benchmark-tasks.md +++ b/skills/agent-comparison/references/benchmark-tasks.md @@ -184,3 +184,76 @@ cd benchmark/{task-name}/compact && go test -race -v # Manual alternative: compare outputs side-by-side using diff diff benchmark/{task-name}/full/ benchmark/{task-name}/compact/ ``` + +## Optimization Loop Task Format + +The current optimization loop is for frontmatter-description and routing-trigger quality. It does not run full code-generation benchmarks. Use Phase 5 with trigger-rate eval tasks, then use Phases 1-4 for full agent benchmarking. + +### Supported Task File Schemas + +Flat list with optional split markers: + +```json +{ + "tasks": [ + { + "name": "go-testing-positive", + "split": "train", + "complexity": "complex", + "query": "write table-driven tests for a Go parser with subtests and helpers", + "should_trigger": true + }, + { + "name": "kubernetes-negative", + "split": "test", + "complexity": "complex", + "query": "debug a kubernetes pod stuck in CrashLoopBackOff", + "should_trigger": false + } + ] +} +``` + +Explicit top-level train/test sets: + +```json +{ + "train": [ + { + "name": "positive-1", + "query": "write Go benchmarks and race tests for a worker pool", + "should_trigger": true + } + ], + "test": [ + { + "name": "negative-1", + "query": "design a PostgreSQL indexing strategy", + "should_trigger": false + } + ] +} +``` + +### Required Fields + +- `query`: the prompt used to test routing behavior +- `should_trigger`: expected boolean outcome for the target description + +### Optional Fields + +- `name`: human-readable label shown in reports +- `split`: `train` or `test` when using a flat `tasks` list +- `complexity`: used for stratified splitting when no explicit split is provided + +### Split Strategy + +- `train` 
tasks are used during each optimization iteration. +- `test` tasks are held out and checked every 5 iterations for Goodhart divergence. +- If no split markers are present, the loop performs a reproducible random split with seed `42`, stratified by `complexity`. + +### Task Selection Principles for Optimization + +1. Cover both positive and negative routing examples. A description that only improves recall while tanking precision is not an improvement. +2. Put at least one out-of-domain prompt in the held-out set. This catches overfitting where the description starts matching benchmark phrasing instead of the real scope. +3. Use realistic user wording, not only canonical trigger phrases. Optimization on synthetic wording alone produces brittle routing behavior. diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md new file mode 100644 index 0000000..2c32067 --- /dev/null +++ b/skills/agent-comparison/references/optimization-guide.md @@ -0,0 +1,181 @@ +# Autoresearch Optimization Guide + +## Scope + +The current autoresearch loop optimizes a markdown target's frontmatter +`description` using trigger-rate eval tasks. This is useful for improving +skill routing accuracy and similar description-driven dispatch behavior. + +It is not a replacement for the manual agent benchmark workflow in Phases 1-4. +If you want to compare real code-generation quality across benchmark tasks, use +the normal A/B process. + +## Supported Targets + +- `skills/{target}/SKILL.md` +- Other markdown targets with valid YAML frontmatter and a non-empty + `description` + +The loop rejects targets without parseable frontmatter or without a +`description`, because trigger-rate evaluation depends on the target text that +drives routing. 
+ +## Supported Task Formats + +Every task must include: + +- `query`: the prompt to test +- `should_trigger`: whether the target should trigger for that prompt + +Optional fields: + +- `name`: label shown in logs and reports +- `split`: `train` or `test` +- `complexity`: used for stratified splitting when `split` is omitted + +Flat task list: + +```json +{ + "tasks": [ + { + "name": "positive-1", + "split": "train", + "complexity": "complex", + "query": "write table-driven Go tests with subtests and helper functions", + "should_trigger": true + }, + { + "name": "negative-1", + "split": "test", + "complexity": "complex", + "query": "debug a Kubernetes pod stuck in CrashLoopBackOff", + "should_trigger": false + } + ] +} +``` + +Explicit train/test sets: + +```json +{ + "train": [ + { + "name": "positive-1", + "query": "write race-safe Go tests for a worker pool", + "should_trigger": true + } + ], + "test": [ + { + "name": "negative-1", + "query": "optimize a PostgreSQL indexing strategy", + "should_trigger": false + } + ] +} +``` + +If no split markers are present, the loop performs a reproducible random split +using `--train-split` and seed `42`. + +## Command + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/go-testing/SKILL.md \ + --goal "improve routing precision without losing recall" \ + --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ + --train-split 0.6 \ + --max-iterations 20 \ + --min-gain 0.02 \ + --model claude-sonnet-4-20250514 \ + --report optimization-report.html \ + --output-dir evals/iterations \ + --verbose +``` + +Useful flags: + +- `--dry-run`: exercise the loop mechanics without API calls +- `--report`: write a live HTML report +- `--output-dir`: persist iteration snapshots and `results.json` + +## Evaluation Model + +The loop follows the ADR-131 structure: + +1. Hard gates +2. Weighted composite score +3. 
Held-out regression checks + +### Layer 1: Hard Gates + +An iteration is rejected immediately if any of these fail: + +- `parses` +- `compiles` +- `tests_pass` +- `protected_intact` + +For description optimization, `parses` and `protected_intact` are the most +important gates. Protected sections fenced by `DO NOT OPTIMIZE` markers must be +preserved verbatim. + +### Layer 2: Composite Score + +The loop converts trigger-rate evaluation results into a weighted composite +score using the built-in weights in `optimize_loop.py`. A variant is kept only +if it beats the previous best by more than `--min-gain`. + +### Layer 3: Held-Out Regression Check + +Every 5 iterations, the current best variant is scored on the held-out test set. +If held-out performance drops below the baseline while train performance has +improved, the loop raises a Goodhart alarm and stops. + +## Deletion Safety Rule + +Deleting sections is allowed only with explicit justification. + +- `generate_variant.py` detects removed `##` headings +- the model must return a `deletion_justification` +- `optimize_loop.py` rejects deletions without one + +This enforces ADR-131's "no deletion without justification" rule. + +## Iteration Artifacts + +When `--output-dir` is set, the loop writes: + +- `001/variant.md` +- `001/scores.json` +- `001/verdict.json` +- `001/diff.patch` +- `best_variant.md` +- `results.json` + +When `--report` is set, it also writes a live HTML dashboard showing: + +- status, baseline, best score, kept/reverted counts +- convergence chart +- iteration table with diffs +- cherry-pick controls for kept iterations + +## Choosing Good Eval Tasks + +1. Include both positive and negative prompts. +2. Put realistic user phrasing in both train and held-out sets. +3. Keep at least one out-of-domain negative example in held-out. +4. Do not let the eval set collapse into benchmark keywords only. 
+ +## Limitations + +Current limitations are intentional and documented: + +- The loop does not execute full code-generation benchmarks. +- Pattern-based benchmark tasks with `prompt`, `expected_patterns`, and + `forbidden_patterns` are not supported by `optimize_loop.py`. +- For full agent quality comparisons, continue to use the manual benchmark and + grading flow in Phases 1-4. diff --git a/skills/agent-comparison/references/optimization-tasks.example.json b/skills/agent-comparison/references/optimization-tasks.example.json new file mode 100644 index 0000000..098226e --- /dev/null +++ b/skills/agent-comparison/references/optimization-tasks.example.json @@ -0,0 +1,32 @@ +{ + "tasks": [ + { + "name": "positive-go-tests", + "split": "train", + "complexity": "complex", + "query": "write table-driven Go tests with subtests and helper functions", + "should_trigger": true + }, + { + "name": "positive-benchmarks", + "split": "train", + "complexity": "simple", + "query": "add Go benchmarks and race-safe test coverage for a worker pool", + "should_trigger": true + }, + { + "name": "negative-kubernetes", + "split": "test", + "complexity": "complex", + "query": "debug a kubernetes pod stuck in CrashLoopBackOff", + "should_trigger": false + }, + { + "name": "negative-sql", + "split": "test", + "complexity": "simple", + "query": "design a PostgreSQL indexing strategy for a reporting query", + "should_trigger": false + } + ] +} diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py new file mode 100644 index 0000000..2378e50 --- /dev/null +++ b/skills/agent-comparison/scripts/generate_variant.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +"""Generate a variant of an agent/skill file using Claude with extended thinking. + +Proposes modifications to improve the target file based on the optimization +goal and previous iteration failures. Preserves protected sections marked +with DO NOT OPTIMIZE markers. 
+ +Pattern: follows improve_description.py's Claude + extended thinking approach. + +Usage: + python3 skills/agent-comparison/scripts/generate_variant.py \ + --target agents/golang-general-engineer.md \ + --goal "improve error handling instructions" \ + --current-content "..." \ + --failures '[...]' \ + --model claude-sonnet-4-20250514 + +Output (JSON to stdout): + { + "variant": "full file content...", + "summary": "Added CRITICAL warning for error wrapping", + "deletion_justification": "", + "reasoning": "Extended thinking content...", + "tokens_used": 12345 + } + +See ADR-131 for safety rules. +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys + +try: + import anthropic +except ImportError: # pragma: no cover - exercised in environments without the SDK + anthropic = None + +# --------------------------------------------------------------------------- +# Protected section handling +# --------------------------------------------------------------------------- + +_PROTECTED_RE = re.compile( + r"(.*?)", + re.DOTALL, +) + + +def extract_protected(content: str) -> list[str]: + """Extract all protected sections from content.""" + return _PROTECTED_RE.findall(content) + + +def restore_protected(original: str, variant: str) -> str: + """Restore protected sections from original into variant.""" + orig_sections = extract_protected(original) + var_sections = extract_protected(variant) + + if len(orig_sections) != len(var_sections): + print( + "Warning: Protected section count mismatch " + f"(original={len(orig_sections)}, variant={len(var_sections)}).", + file=sys.stderr, + ) + return variant + + result = variant + for orig_sec, var_sec in zip(orig_sections, var_sections): + result = result.replace(var_sec, orig_sec, 1) + + return result + + +# --------------------------------------------------------------------------- +# Deletion detection +# --------------------------------------------------------------------------- + + +def 
detect_deletions(original: str, variant: str) -> list[str]: + """Find sections that exist in original but are missing from variant. + + Returns list of deleted section headings. Only checks ## headings. + """ + orig_headings = set(re.findall(r"^##\s+(.+)$", original, re.MULTILINE)) + var_headings = set(re.findall(r"^##\s+(.+)$", variant, re.MULTILINE)) + return sorted(orig_headings - var_headings) + + +# --------------------------------------------------------------------------- +# Variant generation +# --------------------------------------------------------------------------- + + +def generate_variant( + client: anthropic.Anthropic, + target_path: str, + goal: str, + current_content: str, + failures: list[dict], + model: str, + history: list[dict] | None = None, +) -> dict: + """Call Claude to generate a variant of the target file. + + Returns dict with variant content, summary, reasoning, and token count. + """ + # Build the prompt + failure_section = "" + if failures: + failure_section = "\n\nFailed tasks from the last iteration:\n" + for f in failures: + failure_section += f" - {f.get('name', 'unnamed')}: {f.get('details', 'failed')}\n" + + history_section = "" + if history: + history_section = "\n\nPrevious attempts (do NOT repeat — try structurally different approaches):\n" + for h in history: + history_section += f" Iteration {h.get('number', '?')}: {h.get('verdict', '?')} — {h.get('change_summary', '')}\n" + + protected_sections = extract_protected(current_content) + protected_notice = "" + if protected_sections: + protected_notice = f""" + +CRITICAL SAFETY RULE: The file contains {len(protected_sections)} protected section(s) marked with + and markers. +You MUST preserve these sections EXACTLY as they are — character for character. +Do not add, remove, or modify anything between these markers. 
+This is non-negotiable: protected sections contain safety gates that must not be +removed even if removing them would improve test scores.""" + + prompt = f"""You are optimizing an agent/skill file to improve its performance. + +Target file: {target_path} +Optimization goal: {goal} + +Current content of the file: + +{current_content} + +{failure_section}{history_section}{protected_notice} + +SAFETY RULES: +1. Do NOT delete sections without replacing them with equivalent or better content. + If you remove a section heading that exists in the original, you must explain what + replaces the removed functionality. Pure deletion degrades unmeasured capabilities. + +2. Do NOT change the tools, SDKs, or interfaces the agent uses. The variant must work + in the same environment as the original (no switching from SDK to curl, etc.). + +3. Keep YAML frontmatter structure intact (name, description, routing, etc.). + +4. Focus on making the agent/skill better at achieving the stated goal. Common + improvements include: + - Moving critical information to more prominent positions (CRITICAL banners) + - Adding explicit planning steps before code generation + - Improving error handling instructions with specific patterns + - Adding concrete examples for ambiguous instructions + - Restructuring for clarity when sections are dense + +Please respond with the complete modified file content inside tags, +and a brief summary of what you changed and why inside tags. + +If you removed any existing `##` section heading, include a brief justification +inside tags. If you did not remove a section, return +empty tags. 
+ + +[complete file content here] + + + +[1-2 sentence description of the change] + + + +[why any removed section was replaced safely, or leave blank] +""" + + try: + response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[{"role": "user", "content": prompt}], + ) + except anthropic.APIStatusError as e: + print(f"Error: API returned status {e.status_code}: {e.message}", file=sys.stderr) + sys.exit(1) + except anthropic.APIConnectionError as e: + print(f"Error: API connection failed: {e}", file=sys.stderr) + sys.exit(1) + + # Extract thinking and text + thinking_text = "" + text = "" + for block in response.content: + if block.type == "thinking": + thinking_text = block.thinking + elif block.type == "text": + text = block.text + + # Parse variant content + variant_match = re.search(r"(.*?)", text, re.DOTALL) + if not variant_match: + print("Error: No tags in response", file=sys.stderr) + sys.exit(1) + + variant = variant_match.group(1).strip() + + # Parse summary + summary_match = re.search(r"(.*?)", text, re.DOTALL) + summary = summary_match.group(1).strip() if summary_match else "No summary provided" + + deletion_match = re.search(r"(.*?)", text, re.DOTALL) + deletion_justification = deletion_match.group(1).strip() if deletion_match else "" + + # Restore protected sections (safety net) + variant = restore_protected(current_content, variant) + + # Check for unauthorized deletions + deletions = detect_deletions(current_content, variant) + if deletions: + print(f"Warning: Deleted sections: {deletions}", file=sys.stderr) + + tokens_used = response.usage.input_tokens + response.usage.output_tokens + + return { + "variant": variant, + "summary": summary, + "deletion_justification": deletion_justification, + "reasoning": thinking_text, + "tokens_used": tokens_used, + "deletions": deletions, + } + + +# --------------------------------------------------------------------------- +# 
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main() -> None:
    """Parse CLI arguments, request one variant, and print the result as JSON.

    The current file content comes from exactly one of ``--current-content``
    or ``--current-content-file``. Every failure path prints an ``Error:`` line
    to stderr and exits with status 1.
    """
    parser = argparse.ArgumentParser(description="Generate agent/skill variant using Claude")
    parser.add_argument("--target", required=True, help="Path to target file (for context)")
    parser.add_argument("--goal", required=True, help="Optimization goal")
    content_group = parser.add_mutually_exclusive_group(required=True)
    content_group.add_argument("--current-content", help="Current file content")
    content_group.add_argument("--current-content-file", help="Path to a file containing the current content")
    parser.add_argument("--failures", default="[]", help="JSON list of failed tasks")
    parser.add_argument("--history", default="[]", help="JSON list of previous iterations")
    parser.add_argument("--model", default="claude-sonnet-4-20250514", help="Model to use")
    args = parser.parse_args()

    try:
        failures = json.loads(args.failures)
    except json.JSONDecodeError as e:
        print(f"Error: --failures is not valid JSON: {e}", file=sys.stderr)
        sys.exit(1)
    try:
        history = json.loads(args.history)
    except json.JSONDecodeError as e:
        print(f"Error: --history is not valid JSON: {e}", file=sys.stderr)
        sys.exit(1)

    if anthropic is None:
        print("Error: anthropic SDK is not installed", file=sys.stderr)
        sys.exit(1)

    if args.current_content_file:
        # Context manager closes the handle; an unreadable path is reported the
        # same way as the other argument errors instead of raising a traceback.
        try:
            with open(args.current_content_file, encoding="utf-8") as handle:
                current_content = handle.read()
        except OSError as e:
            print(f"Error: cannot read --current-content-file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        current_content = args.current_content

    client = anthropic.Anthropic()
    result = generate_variant(
        client=client,
        target_path=args.target,
        goal=args.goal,
        current_content=current_content,
        failures=failures,
        model=args.model,
        # An empty history list is passed as None, as before.
        history=history or None,
    )

    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
b/skills/agent-comparison/scripts/optimize_loop.py @@ -0,0 +1,1107 @@ +#!/usr/bin/env python3 +"""Autoresearch optimization loop for agent/skill files. + +Wraps the existing agent-comparison evaluation infrastructure in an outer +loop that proposes variants, evaluates them, and keeps/reverts based on +score improvement. The keep/revert decision is arithmetic — no LLM +judgment in the loop itself. + +Usage: + python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target agents/golang-general-engineer.md \ + --goal "improve error handling instructions" \ + --benchmark-tasks tasks.json \ + --max-iterations 20 \ + --min-gain 0.02 + +See ADR-131 for architecture details. +""" + +from __future__ import annotations + +import argparse +import json +import random +import re +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +# --------------------------------------------------------------------------- +# Scoring helpers +# --------------------------------------------------------------------------- + +QUALITY_WEIGHTS = { + "correctness": 0.40, + "error_handling": 0.20, + "language_idioms": 0.15, + "testing": 0.15, + "efficiency": 0.10, +} + +HARD_GATE_KEYS = ["parses", "compiles", "tests_pass", "protected_intact"] + + +def passes_hard_gates(scores: dict) -> bool: + """Layer 1: Hard gates — score is 0 if any fail.""" + return all(scores.get(key, False) for key in HARD_GATE_KEYS) + + +def composite_score(scores: dict) -> float: + """Layer 2: Weighted quality score, conditional on hard gates passing.""" + if not passes_hard_gates(scores): + return 0.0 + total = 0.0 + for dim, weight in QUALITY_WEIGHTS.items(): + total += scores.get(dim, 0.0) * weight + return round(total, 4) + + +def holdout_diverges( + train_score: float, + holdout_score: float, + baseline_holdout: float, + baseline_train: float = 0.0, + threshold: float = 0.5, +) -> bool: + """Goodhart alarm: held-out score drops while train has improved.""" + holdout_dropped = 
(baseline_holdout - holdout_score) > threshold + train_improved = train_score > baseline_train + return holdout_dropped and train_improved + + +# --------------------------------------------------------------------------- +# Iteration snapshot +# --------------------------------------------------------------------------- + + +def save_iteration( + output_dir: Path, + iteration: int, + variant_content: str, + scores: dict, + verdict: str, + reasoning: str, + diff_text: str, + change_summary: str, + stop_reason: str | None = None, + deletions: list[str] | None = None, + deletion_justification: str = "", +) -> dict: + """Save a full iteration snapshot and return its metadata.""" + iter_dir = output_dir / f"{iteration:03d}" + iter_dir.mkdir(parents=True, exist_ok=True) + + (iter_dir / "variant.md").write_text(variant_content) + (iter_dir / "scores.json").write_text(json.dumps(scores, indent=2)) + + verdict_data = { + "iteration": iteration, + "verdict": verdict, + "composite_score": composite_score(scores), + "change_summary": change_summary, + "reasoning": reasoning, + "stop_reason": stop_reason, + "deletions": deletions or [], + "deletion_justification": deletion_justification, + } + (iter_dir / "verdict.json").write_text(json.dumps(verdict_data, indent=2)) + + if diff_text: + (iter_dir / "diff.patch").write_text(diff_text) + + return verdict_data + + +# --------------------------------------------------------------------------- +# Diff generation +# --------------------------------------------------------------------------- + + +def generate_diff(original: str, variant: str, label: str = "target") -> str: + """Generate a unified diff between two strings.""" + import difflib + + original_lines = original.splitlines(keepends=True) + variant_lines = variant.splitlines(keepends=True) + diff = difflib.unified_diff( + original_lines, + variant_lines, + fromfile=f"a/{label}", + tofile=f"b/{label}", + lineterm="\n", + ) + return "".join(diff) + + +def 
make_dry_run_variant(current_content: str, iteration: int) -> tuple[str, str, str]: + """Generate a deterministic local variant for --dry-run mode.""" + marker = f"" + if marker in current_content: + marker = f"" + if current_content.endswith("\n"): + variant = current_content + marker + "\n" + else: + variant = current_content + "\n" + marker + "\n" + return variant, "Synthetic dry-run mutation", "dry-run synthetic variant" + + +# --------------------------------------------------------------------------- +# HTML report generation +# --------------------------------------------------------------------------- + + +def _build_report_data( + target: str, + goal: str, + baseline_composite: float, + baseline_holdout: float | None, + train_size: int, + test_size: int, + iterations: list[dict], + max_iterations: int, + status: str, + total_tokens: int, +) -> dict: + """Build the data structure for HTML report generation.""" + return { + "target": target, + "goal": goal, + "status": status, + "baseline_score": {"train": baseline_composite, "test": baseline_holdout}, + "task_counts": {"train": train_size, "test": test_size}, + "max_iterations": max_iterations, + "total_tokens": total_tokens, + "iterations": iterations, + } + + +def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: + """Generate iteration history HTML report. + + The convergence chart is built client-side using safe DOM methods + (createElementNS, setAttribute, textContent) — no innerHTML. + All string data is escaped server-side via html.escape before + embedding in the template. 
+ """ + import html as html_mod + + target = html_mod.escape(data.get("target", "")) + goal = html_mod.escape(data.get("goal", "")) + status = data.get("status", "RUNNING") + iterations = data.get("iterations", []) + baseline = data.get("baseline_score", {}) + task_counts = data.get("task_counts", {}) + + refresh = '' if auto_refresh else "" + + rows = "" + for it in iterations: + v = it["verdict"] + vcls = {"KEEP": "keep", "REVERT": "revert", "STOP": "stop"}.get(v, "") + sc = it["score"] + train_score = sc.get("train") + test_score = sc.get("test") + score_str = f'{train_score:.2f}' if isinstance(train_score, (int, float)) else "?" + if isinstance(test_score, (int, float)): + score_str += f' / {test_score:.2f}' + delta = str(it.get("delta", "")) + dcls = "d-pos" if delta.startswith("+") and delta != "+0" else "d-neg" if delta.startswith("-") else "d-zero" + summary = html_mod.escape(str(it.get("change_summary", ""))[:80]) + diff_esc = html_mod.escape(str(it.get("diff", ""))) + is_keep = v == "KEEP" + n = it["number"] + + rows += f""" + + {n} + {v} + {score_str} + {delta} + {summary} + + + +
{diff_esc}
+ """ + + chart_json = json.dumps([ + {"x": it["number"], "train": it["score"].get("train", 0), "test": it["score"].get("test")} + for it in iterations + ]) + diffs_json = json.dumps({it["number"]: str(it.get("diff", "")) for it in iterations}) + + bt = baseline.get("train", 0.0) + best = max((it["score"].get("train", bt) for it in iterations), default=bt) + kept = sum(1 for it in iterations if it["verdict"] == "KEEP") + reverted = sum(1 for it in iterations if it["verdict"] == "REVERT") + cur = len(iterations) + mx = data.get("max_iterations", 20) + scls = "running" if status == "RUNNING" else "done" if status in ("CONVERGED", "COMPLETE") else "alarm" + score_label = f"Train tasks: {task_counts.get('train', 0)}" + if task_counts.get("test"): + score_label += f" | Held-out tasks: {task_counts['test']}" + + return f""" + + +{refresh} +Optimization: {target} + + + +

Optimization: {target}

+

Goal: {goal}

+
+
Status{status}
+
Progress{cur}/{mx}
+
Baseline{bt:.2f}
+
Best{best:.2f} ({best - bt:+.2f})
+
Kept{kept}
+
Reverted{reverted}
+
+

{score_label}

+
+ + +{rows} +
#VerdictScoreDeltaChangePick
+
+ + +
+ + + +""" + + +# --------------------------------------------------------------------------- +# Task loading and splitting +# --------------------------------------------------------------------------- + + +def load_benchmark_tasks(path: Path) -> list[dict]: + """Load benchmark tasks from JSON file.""" + data = json.loads(path.read_text()) + if isinstance(data, list): + return data + if "tasks" in data: + return data["tasks"] + if "train" in data or "test" in data: + tasks = [] + for split_name in ("train", "test"): + for task in data.get(split_name, []): + normalized = dict(task) + normalized.setdefault("split", split_name) + tasks.append(normalized) + return tasks + raise ValueError("Task file must be a list, {'tasks': [...]}, or {'train': [...], 'test': [...]}.") + + +def split_tasks( + tasks: list[dict], + train_split: float, + seed: int = 42, +) -> tuple[list[dict], list[dict]]: + """Split tasks into train and test sets. + + Uses explicit 'split' field if present, otherwise random split + stratified by complexity. 
+ """ + has_explicit = any("split" in t for t in tasks) + if has_explicit: + train = [t for t in tasks if t.get("split", "train") == "train"] + test = [t for t in tasks if t.get("split") == "test"] + return train, test + + rng = random.Random(seed) + by_complexity: dict[str, list[dict]] = {} + for t in tasks: + by_complexity.setdefault(t.get("complexity", "medium"), []).append(t) + + train, test = [], [] + for group in by_complexity.values(): + rng.shuffle(group) + n_train = max(1, int(len(group) * train_split)) + train.extend(group[:n_train]) + test.extend(group[n_train:]) + + return train, test + + +# --------------------------------------------------------------------------- +# Frontmatter parsing +# --------------------------------------------------------------------------- + + +def _parse_frontmatter(content: str) -> tuple[bool, str]: + """Parse YAML frontmatter, returning (valid, description).""" + if not content.startswith("---"): + return False, "" + lines = content.split("\n") + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + return False, "" + + description = "" + fm_lines = lines[1:end_idx] + idx = 0 + while idx < len(fm_lines): + line = fm_lines[idx] + if line.startswith("description:"): + value = line[len("description:"):].strip() + if value in (">", "|", ">-", "|-"): + parts: list[str] = [] + idx += 1 + while idx < len(fm_lines) and (fm_lines[idx].startswith(" ") or fm_lines[idx].startswith("\t")): + parts.append(fm_lines[idx].strip()) + idx += 1 + description = " ".join(parts) + continue + else: + description = value.strip('"').strip("'") + idx += 1 + return True, description + + +def _is_trigger_task(task: dict) -> bool: + return "query" in task and "should_trigger" in task + + +def _is_pattern_task(task: dict) -> bool: + return "prompt" in task and ( + "expected_patterns" in task or "forbidden_patterns" in task or "weight" in task + ) + + +def 
_validate_task_set(tasks: list[dict]) -> None: + """Reject unsupported or mixed task formats early with a clear error.""" + if not tasks: + raise ValueError("Task file is empty.") + + trigger_tasks = sum(1 for task in tasks if _is_trigger_task(task)) + pattern_tasks = sum(1 for task in tasks if _is_pattern_task(task)) + + if trigger_tasks and pattern_tasks: + raise ValueError("Task file mixes trigger-rate and pattern benchmark formats. Use one format per run.") + + if trigger_tasks == len(tasks): + return + + if pattern_tasks == len(tasks): + raise ValueError( + "Pattern benchmark tasks are not supported by optimize_loop.py yet. " + "Use trigger-rate tasks with 'query' and 'should_trigger' fields." + ) + + raise ValueError( + "Unsupported task format. Expected trigger-rate tasks with 'query' and 'should_trigger' fields." + ) + + +# --------------------------------------------------------------------------- +# Trigger-rate evaluator (uses existing run_eval infrastructure) +# --------------------------------------------------------------------------- + + +def _run_trigger_rate( + target_path: Path, + description: str, + tasks: list[dict], + num_workers: int = 5, + timeout: int = 30, + verbose: bool = False, +) -> dict: + """Run trigger-rate assessment using the skill_eval infrastructure. + + Tasks must have 'query' and 'should_trigger' fields. + Returns run_eval-style results dict. 
+ """ + import os + import tempfile + + task_file = None + try: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(tasks, f) + task_file = f.name + + with tempfile.TemporaryDirectory() as skill_dir: + skill_md = Path(skill_dir) / "SKILL.md" + skill_md.write_text(target_path.read_text()) + + project_root = Path.cwd() + for parent in [project_root, *project_root.parents]: + if (parent / ".claude").is_dir(): + project_root = parent + break + + cmd = [ + sys.executable, "-m", "scripts.skill_eval.run_eval", + "--eval-set", task_file, + "--skill-path", skill_dir, + "--description", description, + "--num-workers", str(num_workers), + "--timeout", str(timeout), + "--runs-per-query", "1", + ] + if verbose: + cmd.append("--verbose") + print(f"Running trigger assessment: {len(tasks)} queries", file=sys.stderr) + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + result = subprocess.run( + cmd, capture_output=True, text=True, + cwd=str(project_root), env=env, timeout=600, + ) + + if result.returncode != 0: + if verbose: + print(f"Trigger assessment failed: {result.stderr[:300]}", file=sys.stderr) + return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} + + return json.loads(result.stdout) + finally: + if task_file: + Path(task_file).unlink(missing_ok=True) + + +# --------------------------------------------------------------------------- +# Evaluation bridge +# --------------------------------------------------------------------------- + + +def assess_target( + target_path: Path, + tasks: list[dict], + goal: str, + verbose: bool = False, + dry_run: bool = False, +) -> dict: + """Assess a target file against tasks. + + Supports three modes: + - Trigger-rate: tasks have 'query' + 'should_trigger' fields. + Uses existing run_eval infrastructure via claude -p. + - Dry-run: returns synthetic scores for testing loop mechanics. + - Benchmark (NYI): tasks have 'prompt' + 'name' fields. 
+ + Returns scores dict with hard gate booleans and quality dimensions. + """ + scores: dict = { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": 0.0, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [], + } + + content = target_path.read_text() + valid, description = _parse_frontmatter(content) + if not valid or not description: + scores["parses"] = False + return scores + + # Dry-run mode: content-dependent synthetic scores for testing loop mechanics. + # Hard gates always pass (the point is testing keep/revert logic). + # Quality scores vary deterministically based on content hash so that + # different variants produce different scores. + if dry_run: + import hashlib + h = int(hashlib.sha256(content.encode()).hexdigest()[:8], 16) + base = (h % 30 + 70) / 100.0 # 0.70-1.00 range — always decent + scores["correctness"] = round(base * 10, 2) + scores["error_handling"] = round(base * 8, 2) + scores["language_idioms"] = round(base * 7, 2) + scores["testing"] = round(base * 7, 2) + scores["efficiency"] = round(base * 6, 2) + scores["tests_pass"] = True # always pass in dry-run + for task in tasks: + name = task.get("name", task.get("query", "unnamed"))[:40] + scores["task_results"].append({ + "name": name, "passed": True, + "score": base, "details": "dry-run", + }) + return scores + + # Detect assessment mode from task format + is_trigger = all(_is_trigger_task(task) for task in tasks) + + if is_trigger: + results = _run_trigger_rate(target_path, description, tasks, verbose=verbose) + summary = results.get("summary", {}) + total = summary.get("total", 0) + passed = summary.get("passed", 0) + if total == 0: + return scores + + accuracy = passed / total + scores["correctness"] = round(accuracy * 10, 2) + scores["error_handling"] = round(accuracy * 8, 2) + scores["language_idioms"] = round(accuracy * 7, 2) + scores["testing"] = round(accuracy * 8, 2) + 
scores["efficiency"] = round(min(1.0, accuracy + 0.1) * 6, 2) + scores["tests_pass"] = passed == total + + for r in results.get("results", []): + scores["task_results"].append({ + "name": r.get("query", "unnamed")[:40], + "passed": r.get("pass", False), + "score": 1.0 if r.get("pass", False) else 0.0, + "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}", + }) + return scores + + # Benchmark behavioral assessment — not yet implemented. + # Use trigger-rate format (tasks with 'query' + 'should_trigger') + # as the recommended starting point per ADR-131 research findings. + raise NotImplementedError( + "Pattern benchmark tasks are not yet implemented. " + "Use trigger-rate tasks with 'query' and 'should_trigger' fields. " + "See optimization-guide.md." + ) + + +# --------------------------------------------------------------------------- +# Protected section validation +# --------------------------------------------------------------------------- + +_PROTECTED_RE = re.compile( + r"(.*?)", + re.DOTALL, +) + + +def check_protected_sections(original: str, variant: str) -> bool: + """Verify DO NOT OPTIMIZE sections are preserved verbatim.""" + orig = list(_PROTECTED_RE.finditer(original)) + var = list(_PROTECTED_RE.finditer(variant)) + if len(orig) != len(var): + return False + return all(orig_match.group(0) == var_match.group(0) for orig_match, var_match in zip(orig, var)) + + +# --------------------------------------------------------------------------- +# Main loop +# --------------------------------------------------------------------------- + + +def run_optimization_loop( + target_path: Path, + goal: str, + benchmark_tasks_path: Path, + max_iterations: int = 20, + min_gain: float = 0.02, + train_split: float = 0.6, + model: str = "claude-sonnet-4-20250514", + verbose: bool = False, + report_path: Path | None = None, + output_dir: Path | None = None, + dry_run: bool = False, +) -> dict: + """Run the autoresearch optimization loop.""" + if output_dir is None: + 
output_dir = Path("evals/iterations") + output_dir.mkdir(parents=True, exist_ok=True) + + all_tasks = load_benchmark_tasks(benchmark_tasks_path) + _validate_task_set(all_tasks) + train_tasks, test_tasks = split_tasks(all_tasks, train_split) + + if verbose: + print(f"Tasks: {len(train_tasks)} train, {len(test_tasks)} test", file=sys.stderr) + + original_content = target_path.read_text() + target_valid, target_description = _parse_frontmatter(original_content) + if not target_valid or not target_description: + raise ValueError( + "Target must have YAML frontmatter with a non-empty description. " + "optimize_loop.py currently supports frontmatter-description optimization only." + ) + current_content = original_content + target_label = target_path.name + + if verbose: + print("Running baseline evaluation...", file=sys.stderr) + + baseline_scores = assess_target(target_path, train_tasks, goal, verbose, dry_run) + baseline_composite = composite_score(baseline_scores) + best_score = baseline_composite + best_content = current_content + best_iteration = 0 + + baseline_holdout_scores = assess_target(target_path, test_tasks, goal, verbose, dry_run) if test_tasks else None + baseline_holdout = composite_score(baseline_holdout_scores) if baseline_holdout_scores else None + + if verbose: + holdout_display = f"{baseline_holdout:.4f}" if baseline_holdout is not None else "n/a" + print(f"Baseline: train={baseline_composite:.4f}, holdout={holdout_display}", file=sys.stderr) + + iterations: list[dict] = [] + consecutive_reverts = 0 + exit_reason = "unknown" + status = "RUNNING" + total_tokens = 0 + + for i in range(1, max_iterations + 1): + if verbose: + print(f"\n{'=' * 60}\nIteration {i}/{max_iterations} (best={best_score:.4f})", file=sys.stderr) + + # 1. 
Generate variant + t0 = time.time() + last_failures = [] + if iterations: + last_scores_data = iterations[-1].get("scores", {}) + last_failures = [t for t in last_scores_data.get("task_results", []) if not t.get("passed")] + history = [ + { + "number": item["number"], + "verdict": item["verdict"], + "change_summary": item["change_summary"], + "delta": item["delta"], + } + for item in iterations[-5:] + ] + + if dry_run: + variant_content, change_summary, reasoning = make_dry_run_variant(current_content, i) + variant_output = { + "variant": variant_content, + "summary": change_summary, + "reasoning": reasoning, + "tokens_used": 0, + "deletions": [], + "deletion_justification": "", + } + deletions = [] + deletion_justification = "" + else: + with tempfile.NamedTemporaryFile(mode="w", suffix=target_path.suffix, encoding="utf-8") as current_file: + current_file.write(current_content) + current_file.flush() + variant_result = subprocess.run( + [ + sys.executable, + str(Path(__file__).parent / "generate_variant.py"), + "--target", str(target_path), + "--goal", goal, + "--current-content-file", current_file.name, + "--failures", json.dumps(last_failures), + "--history", json.dumps(history), + "--model", model, + ], + capture_output=True, text=True, timeout=120, + ) + + if variant_result.returncode != 0: + if verbose: + print(f"Variant generation failed: {variant_result.stderr}", file=sys.stderr) + consecutive_reverts += 1 + iterations.append({ + "number": i, "verdict": "REVERT", + "score": {"train": best_score}, + "delta": "0", "change_summary": "Variant generation failed", + "reasoning": variant_result.stderr[:200], "diff": "", + }) + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + continue + + try: + variant_output = json.loads(variant_result.stdout) + variant_content = variant_output["variant"] + change_summary = variant_output.get("summary", "") + reasoning = 
variant_output.get("reasoning", "") + total_tokens += variant_output.get("tokens_used", 0) + deletions = variant_output.get("deletions", []) + deletion_justification = variant_output.get("deletion_justification", "").strip() + except (json.JSONDecodeError, KeyError) as e: + if verbose: + print(f"Parse error: {e}", file=sys.stderr) + consecutive_reverts += 1 + iterations.append({ + "number": i, "verdict": "REVERT", + "score": {"train": best_score}, + "delta": "0", "change_summary": f"Parse error: {e}", + "reasoning": "", "diff": "", + }) + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + continue + + gen_elapsed = time.time() - t0 + + # 2. Validate protected sections + if not check_protected_sections(original_content, variant_content): + if verbose: + print("REJECTED: Protected sections modified", file=sys.stderr) + diff_text = generate_diff(current_content, variant_content, target_label) + save_iteration(output_dir, i, variant_content, {"protected_intact": False}, + "REVERT", "Protected sections modified", diff_text, change_summary) + iterations.append({ + "number": i, "verdict": "REVERT", + "score": {"train": 0.0}, + "delta": "0", "change_summary": "Protected sections modified", + "reasoning": reasoning, "diff": diff_text, + }) + consecutive_reverts += 1 + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + continue + + if deletions and not deletion_justification: + if verbose: + print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr) + diff_text = generate_diff(current_content, variant_content, target_label) + save_iteration( + output_dir, + i, + variant_content, + {"protected_intact": True}, + "REVERT", + "Deleted sections without justification", + diff_text, + change_summary, + deletions=deletions, + ) + iterations.append({ + "number": i, + "verdict": "REVERT", + 
"score": {"train": best_score}, + "delta": "0", + "change_summary": "Deleted sections without justification", + "reasoning": reasoning, + "diff": diff_text, + "deletions": deletions, + "deletion_justification": "", + }) + consecutive_reverts += 1 + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + continue + + # 3. Evaluate variant + temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}" + temp_target.write_text(variant_content) + try: + t0 = time.time() + variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run) + eval_elapsed = time.time() - t0 + variant_composite = composite_score(variant_scores) + finally: + temp_target.unlink(missing_ok=True) + + diff_text = generate_diff(current_content, variant_content, target_label) + + if verbose: + print(f"Score: {variant_composite:.4f} (gain={variant_composite - best_score:.4f}, gen={gen_elapsed:.1f}s, eval={eval_elapsed:.1f}s)", file=sys.stderr) + + # 4. 
Keep/revert (deterministic arithmetic) + gain = variant_composite - best_score + if gain > min_gain: + verdict = "KEEP" + best_score = variant_composite + best_content = variant_content + best_iteration = i + current_content = variant_content + consecutive_reverts = 0 + delta_str = f"+{gain:.2f}" + else: + verdict = "REVERT" + consecutive_reverts += 1 + delta_str = f"{gain:+.2f}" if gain != 0 else "0" + + if deletions and deletion_justification: + change_summary = f"{change_summary} [deletion justified]" + + save_iteration(output_dir, i, variant_content, variant_scores, + verdict, reasoning, diff_text, change_summary, + deletions=deletions, deletion_justification=deletion_justification) + + iteration_data: dict = { + "number": i, "verdict": verdict, + "score": {"train": variant_composite, "test": None}, + "delta": delta_str, "change_summary": change_summary, + "reasoning": reasoning, "diff": diff_text, + "tokens_used": variant_output.get("tokens_used", 0), + "scores": variant_scores, + "deletions": deletions, + "deletion_justification": deletion_justification, + } + + # 5. Goodhart alarm — every 5 iterations, check held-out set + if test_tasks and i % 5 == 0: + try: + temp_target.write_text(best_content) + holdout_scores = assess_target(temp_target, test_tasks, goal, verbose, dry_run) + holdout_composite = composite_score(holdout_scores) + iteration_data["score"]["test"] = holdout_composite + finally: + temp_target.unlink(missing_ok=True) + + if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite): + if verbose: + print(f"GOODHART ALARM: holdout={holdout_composite:.4f} vs baseline={baseline_holdout:.4f}", file=sys.stderr) + exit_reason = f"goodhart_alarm (iteration {i})" + status = "GOODHART_ALARM" + iterations.append(iteration_data) + break + + iterations.append(iteration_data) + + # 6. 
Convergence check + if consecutive_reverts >= 5: + exit_reason = f"converged (5 consecutive reverts at iteration {i})" + status = "CONVERGED" + break + + # Regenerate live report + if report_path: + rd = _build_report_data(target_label, goal, baseline_composite, baseline_holdout, + len(train_tasks), len(test_tasks), iterations, max_iterations, + status, total_tokens) + report_path.write_text(generate_optimization_report(rd, auto_refresh=True)) + + else: + exit_reason = f"max_iterations ({max_iterations})" + status = "COMPLETE" + + # Final report + if report_path: + rd = _build_report_data(target_label, goal, baseline_composite, baseline_holdout, + len(train_tasks), len(test_tasks), iterations, max_iterations, + status, total_tokens) + report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) + + if best_iteration > 0: + best_path = output_dir / "best_variant.md" + best_path.write_text(best_content) + if verbose: + print(f"\nBest variant saved to: {best_path}", file=sys.stderr) + + result = { + "exit_reason": exit_reason, "status": status, + "target": str(target_path), "goal": goal, + "baseline_score": {"train": baseline_composite, "test": baseline_holdout}, + "baseline_train_score": baseline_composite, + "baseline_holdout_score": baseline_holdout, + "best_score": best_score, + "best_iteration": best_iteration, "iterations_run": len(iterations), + "max_iterations": max_iterations, + "improvements_found": sum(1 for it in iterations if it["verdict"] == "KEEP"), + "total_tokens": total_tokens, + "train_size": len(train_tasks), "test_size": len(test_tasks), + "iterations": iterations, + } + (output_dir / "results.json").write_text(json.dumps(result, indent=2)) + return result + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser(description="Autoresearch optimization loop for agent/skill 
files") + parser.add_argument("--target", required=True, help="Path to agent/skill file to optimize") + parser.add_argument("--goal", required=True, help="Optimization objective") + parser.add_argument("--benchmark-tasks", required=True, help="Path to benchmark tasks JSON") + parser.add_argument("--max-iterations", type=int, default=20, help="Max iterations (default: 20)") + parser.add_argument("--min-gain", type=float, default=0.02, help="Min score gain to keep (default: 0.02)") + parser.add_argument("--train-split", type=float, default=0.6, help="Train fraction (default: 0.6)") + parser.add_argument("--model", required=True, help="Model for variant generation") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--dry-run", action="store_true", help="Use synthetic scores (test loop mechanics without API)") + parser.add_argument("--report", default=None, help="Path for live HTML report") + parser.add_argument("--output-dir", default=None, help="Directory for iteration snapshots") + args = parser.parse_args() + + target = Path(args.target) + if not target.exists(): + print(f"Error: Target not found: {target}", file=sys.stderr) + sys.exit(1) + + tasks_path = Path(args.benchmark_tasks) + if not tasks_path.exists(): + print(f"Error: Tasks not found: {tasks_path}", file=sys.stderr) + sys.exit(1) + + try: + result = run_optimization_loop( + target_path=target, goal=args.goal, + benchmark_tasks_path=tasks_path, + max_iterations=args.max_iterations, min_gain=args.min_gain, + train_split=args.train_split, model=args.model, + verbose=args.verbose, + report_path=Path(args.report) if args.report else None, + output_dir=Path(args.output_dir) if args.output_dir else None, + dry_run=args.dry_run, + ) + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + print(json.dumps(result, indent=2)) + if args.verbose: + print(f"\nExit: {result['exit_reason']}", file=sys.stderr) + print(f"Best: 
{result['best_score']:.4f} (iteration {result['best_iteration']})", file=sys.stderr) + print(f"Improvements: {result['improvements_found']}/{result['iterations_run']}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/skills/skill-creator/assets/eval_viewer.html b/skills/skill-creator/assets/eval_viewer.html index 636532b..48822b6 100644 --- a/skills/skill-creator/assets/eval_viewer.html +++ b/skills/skill-creator/assets/eval_viewer.html @@ -641,6 +641,98 @@ @media (prefers-reduced-motion: reduce) { *, *::before, *::after { transition: none !important; animation: none !important; } } + +/* ─── Iterations Tab ───────────────────────────────────────────────── */ +.opt-dashboard { + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius-lg); + padding: 16px 20px; + margin-bottom: 20px; + display: grid; + grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); + gap: 12px; +} + +.opt-dash-item { + display: flex; + flex-direction: column; + gap: 2px; +} + +.opt-dash-label { + font-size: 11px; + color: var(--muted); + text-transform: uppercase; + letter-spacing: 0.06em; +} + +.opt-dash-value { + font-size: 16px; + font-weight: 600; + color: var(--bright); + font-variant-numeric: tabular-nums; +} + +.opt-running { color: var(--accent); } +.opt-done { color: var(--green); } +.opt-alarm { color: var(--red); } + +.opt-chart { margin-bottom: 20px; } + +.iter-row { cursor: pointer; transition: background 0.1s; } +.iter-row:hover { background: var(--surface-2); } + +.iter-diff-row td { padding: 0; } + +.iter-diff-block { + background: var(--code-bg); + border: 1px solid var(--border); + padding: 12px 14px; + font-size: 11.5px; + font-family: var(--font-mono); + max-height: 400px; + overflow: auto; + white-space: pre; + line-height: 1.6; + color: #8899bb; +} + +.verdict-keep { color: var(--green); font-weight: 600; } +.verdict-revert { color: var(--red); font-weight: 600; } +.verdict-stop { color: var(--yellow); 
font-weight: 600; } + +.opt-actions { + margin-top: 16px; + display: flex; + gap: 10px; + align-items: center; +} + +.opt-btn { + padding: 8px 18px; + border-radius: var(--radius); + border: 1px solid var(--border-2); + background: var(--surface-2); + color: var(--text); + cursor: pointer; + font-size: 12px; + font-family: var(--font-sans); + font-weight: 500; + transition: all 0.15s; +} + +.opt-btn:hover { color: var(--bright); background: var(--surface-3); } +.opt-btn:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; } + +.opt-btn-primary { + background: var(--accent); + color: #fff; + border: none; + box-shadow: 0 1px 4px rgba(77, 142, 245, 0.3); +} + +.opt-btn-primary:hover { background: #5a99f8; } @@ -656,12 +748,14 @@

Blind A/B Code Review

+
+
diff --git a/skills/skill-creator/scripts/eval_compare.py b/skills/skill-creator/scripts/eval_compare.py index 58f1849..552100b 100644 --- a/skills/skill-creator/scripts/eval_compare.py +++ b/skills/skill-creator/scripts/eval_compare.py @@ -162,6 +162,24 @@ def find_iteration_dirs(workspace: Path) -> list[Path]: return [d for d in dirs if d.is_dir()] +def load_optimization_data(workspace: Path) -> dict | None: + """Load optimization loop results when present in the workspace.""" + def looks_like_optimization_results(data: dict) -> bool: + return isinstance(data, dict) and "iterations" in data and "baseline_score" in data and "target" in data + + candidates = [ + workspace / "results.json", + workspace / "evals" / "iterations" / "results.json", + workspace / "out" / "results.json", + ] + for path in candidates: + if path.exists(): + data = load_json_safe(path) + if data is not None and looks_like_optimization_results(data): + return data + return None + + def build_data(workspace: Path) -> dict: """Build full comparison data.""" evals_path = workspace / "evals" / "evals.json" @@ -185,6 +203,7 @@ def build_data(workspace: Path) -> dict: "variantAName": "Variant A", "variantBName": "Variant B", "variantCName": "Variant C", + "optimization": load_optimization_data(workspace), } iteration = iterations[-1] # Latest iteration @@ -239,6 +258,7 @@ def build_data(workspace: Path) -> dict: "variantAName": variants.get("A", {}).get("name", "Variant A"), "variantBName": variants.get("B", {}).get("name", "Variant B"), "variantCName": variants.get("C", {}).get("name", "Variant C"), + "optimization": load_optimization_data(workspace), }