diff --git a/pipelines/comprehensive-review/SKILL.md b/pipelines/comprehensive-review/SKILL.md
index 8ba22ea..9ea2a61 100644
--- a/pipelines/comprehensive-review/SKILL.md
+++ b/pipelines/comprehensive-review/SKILL.md
@@ -1,23 +1,16 @@
 ---
 name: comprehensive-review
 description: |
-  Unified 4-wave code review: Wave 0 auto-discovers packages/modules and
-  dispatches one language-specialist agent per package for deep per-package
-  analysis. Wave 1 dispatches 12 foundation reviewers in parallel (with Wave 0
-  context). Wave 2 dispatches 10 deep-dive reviewers that receive Wave 0+1
-  findings as context for targeted analysis. Wave 3 dispatches 4-5 adversarial
-  reviewers that challenge Wave 1+2 consensus — contrarian, skeptical senior,
-  user advocate, meta-process, and conditionally SAPCC structural. Aggregates
-  all findings by severity with wave-agreement labels (unanimous, majority,
-  contested), then auto-fixes ALL issues. Covers per-package deep review,
-  security, business logic, architecture, error handling, test coverage, type
-  design, code quality, comment analysis, language idioms, docs validation,
-  newcomer perspective, performance, concurrency, API contracts, dependencies,
-  error messages, dead code, naming, observability, config safety, migration
-  safety, and adversarial challenge.
-  Use for "comprehensive review", "full review", "review everything", "review
-  and fix", or "thorough code review".
-  Do NOT use for single-concern reviews (use individual agents instead).
+  Four-wave code review pipeline for large or high-risk changes. Wave 0
+  auto-discovers packages/modules and runs per-package specialist review. Wave
+  1 runs broad foundation reviewers in parallel. Wave 2 runs targeted deep-dive
+  reviewers using earlier findings as context. Wave 3 runs adversarial reviewers
+  that challenge consensus and surface missed risks. Aggregates findings by
+  severity and agreement level, deduplicates them, and can auto-fix issues
+  unless review-only mode is used. Use for "comprehensive review", "full
+  review", "review everything", "review and fix", or "thorough code review".
+  Do NOT use for narrow single-concern reviews; use smaller review skills
+  instead.
 effort: high
 version: 4.0.0
 user-invocable: false
diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py
new file mode 100644
index 0000000..a0f2faa
--- /dev/null
+++ b/scripts/tests/test_agent_comparison_optimize_loop.py
@@ -0,0 +1,132 @@
+import importlib.util
+import json
+from pathlib import Path
+import sys
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+def load_module(name: str, relative_path: str):  # import a repo script by file path as `name`
+    spec = importlib.util.spec_from_file_location(name, REPO_ROOT / relative_path)
+    module = importlib.util.module_from_spec(spec)
+    assert spec.loader is not None  # a loader is required to execute the module body below
+    spec.loader.exec_module(module)  # runs the script's top-level code; not added to sys.modules
+    return module
+
+
+def test_assess_target_rejects_missing_frontmatter(tmp_path):  # no frontmatter => fails `parses`
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+    target = tmp_path / "SKILL.md"
+    target.write_text("# no frontmatter\nbody\n")  # markdown without a leading `---` block
+
+    scores = optimize_loop.assess_target(
+        target,
+        [{"query": "write tests", "should_trigger": True}],
+        "improve routing precision",
+        dry_run=True,
+    )
+
+    assert scores["parses"] is False
+    assert optimize_loop.composite_score(scores) == 0.0  # a failed hard gate zeroes the composite
+
+
+def test_check_protected_sections_rejects_missing_blocks():  # dropped protected block must fail
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+    original = (
+        "alpha\n"
+        "<!-- DO NOT OPTIMIZE -->\n"
+        "keep me\n"
+        "<!-- END DO NOT OPTIMIZE -->\n"
+        "omega\n"
+    )
+    relocated = "alpha\nomega\n"  # despite the name, the protected block is removed entirely
+
+    assert optimize_loop.check_protected_sections(original, relocated) is False
+
+
+def test_restore_protected_does_not_silently_reinsert_missing_blocks():  # mismatch => unchanged
+    generate_variant = load_module(
+        "agent_comparison_generate_variant",
+        "skills/agent-comparison/scripts/generate_variant.py",
+    )
+    original = (
+        "alpha\n"
+        "<!-- DO NOT OPTIMIZE -->\n"
+        "keep me\n"
+        "<!-- END DO NOT OPTIMIZE -->\n"
+        "omega\n"
+    )
+    variant = "alpha\nomega\n"  # protected block dropped, so section counts differ (1 vs 0)
+
+    restored = generate_variant.restore_protected(original, variant)
+
+    assert restored == variant  # restore_protected returns the variant as-is on a count mismatch
+
+
+def test_generate_variant_main_reads_current_content_from_file(tmp_path, monkeypatch, capsys):
+    generate_variant = load_module(
+        "agent_comparison_generate_variant",
+        "skills/agent-comparison/scripts/generate_variant.py",
+    )
+
+    class FakeBlock:  # stand-in for a response content block of type "thinking" or "text"
+        def __init__(self, block_type: str, text: str):
+            self.type = block_type
+            if block_type == "thinking":
+                self.thinking = text
+            else:
+                self.text = text
+
+    class FakeResponse:  # canned reply: one thinking block, one text block, usage of 1 + 2 tokens
+        def __init__(self):
+            self.content = [
+                FakeBlock("thinking", "reasoning"),
+                FakeBlock(
+                    "text",
+                    "<variant>---\ndescription: updated\n---</variant>"
+                    "<summary>updated</summary><deletion_justification></deletion_justification>",
+                ),
+            ]
+            self.usage = type("Usage", (), {"input_tokens": 1, "output_tokens": 2})()
+
+    class FakeClient:  # messages.create(**kwargs) ignores its arguments and returns FakeResponse
+        def __init__(self):
+            self.messages = type("Messages", (), {"create": lambda self, **kwargs: FakeResponse()})()
+
+    class FakeAnthropicModule:  # swapped in for generate_variant's module-level `anthropic`
+        class Anthropic:
+            def __new__(cls):  # so Anthropic() yields a FakeClient instead of a real client
+                return FakeClient()
+
+    content_file = tmp_path / "current.md"
+    content_file.write_text("---\ndescription: current\n---\n")
+
+    monkeypatch.setattr(generate_variant, "anthropic", FakeAnthropicModule)
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "generate_variant.py",
+            "--target",
+            "skills/example/SKILL.md",
+            "--goal",
+            "improve routing precision",
+            "--current-content-file",
+            str(content_file),
+            "--model",
+            "fake-model",
+        ],
+    )
+
+    generate_variant.main()
+    output = json.loads(capsys.readouterr().out)  # main() prints the result dict as JSON on stdout
+
+    assert output["variant"] == "---\ndescription: updated\n---"
+    assert output["tokens_used"] == 3  # input_tokens (1) + output_tokens (2)
diff --git a/scripts/tests/test_eval_compare_optimization.py b/scripts/tests/test_eval_compare_optimization.py
new file mode 100644
index 0000000..f687c7a
--- /dev/null
+++ b/scripts/tests/test_eval_compare_optimization.py
@@ -0,0 +1,33 @@
+import importlib.util
+import json
+from pathlib import Path
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+def load_module(name: str, relative_path: str):  # import a repo script by file path as `name`
+    spec = importlib.util.spec_from_file_location(name, REPO_ROOT / relative_path)
+    module = importlib.util.module_from_spec(spec)
+    assert spec.loader is not None  # a loader is required to execute the module body below
+    spec.loader.exec_module(module)  # runs the script's top-level code; not added to sys.modules
+    return module
+
+
+def test_load_optimization_data_ignores_unrelated_results_json(tmp_path):  # decoy must be skipped
+    eval_compare = load_module(
+        "skill_creator_eval_compare",
+        "skills/skill-creator/scripts/eval_compare.py",
+    )
+    (tmp_path / "results.json").write_text(json.dumps({"status": "not-optimization"}))  # decoy
+    (tmp_path / "evals" / "iterations").mkdir(parents=True)
+    expected = {
+        "target": "skills/example/SKILL.md",
+        "baseline_score": {"train": 1.0, "test": 1.0},
+        "iterations": [],
+    }
+    (tmp_path / "evals" / "iterations" / "results.json").write_text(json.dumps(expected))
+
+    loaded = eval_compare.load_optimization_data(tmp_path)
+
+    assert loaded == expected  # the nested evals/iterations/results.json is the one loaded
diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md
index 2fed9c1..7f7d1eb 100644
--- a/skills/agent-comparison/SKILL.md
+++ b/skills/agent-comparison/SKILL.md
@@ -2,12 +2,15 @@
 name: agent-comparison
 description: |
   A/B test agent variants measuring quality and total session token cost
-  across simple and complex benchmarks. Use when creating compact agent
-  versions, validating agent changes, comparing internal vs external agents,
-  or deciding between variants for production. Use for "compare agents",
-  "A/B test", "benchmark agents", or "test agent efficiency". Route single-agent evaluation to agent-evaluation, testing skills, or optimizing prompts
-  without variant comparison.
-version: 2.0.0
+  across simple and complex benchmarks. Also supports automated optimization
+  loops (autoresearch) for frontmatter description and routing-trigger quality
+  using train/test eval sets. Use when creating compact agent versions, validating
+  agent changes, comparing internal vs external agents, optimizing a skill description,
+  or deciding between variants for production. Use for "compare agents", "A/B test",
+  "benchmark agents", "test agent efficiency", "optimize description", "optimize skill",
+  or "run autoresearch". Route single-agent evaluation to agent-evaluation; testing skills and
+  simple prompt optimization without variant comparison are out of scope.
+version: 2.2.0
 user-invocable: false
 allowed-tools:
   - Read
@@ -22,6 +25,9 @@ routing:
     - "compare agents"
     - "A/B test agents"
     - "benchmark agents"
+    - "optimize skill"
+    - "optimize description"
+    - "run autoresearch"
   category: meta-tooling
 ---
 
@@ -254,6 +260,129 @@ Remove temporary benchmark files and debug outputs. Keep only the comparison rep
 
 **Gate**: Report generated with all metrics. Verdict stated with evidence. Report saved to benchmark directory.
 
+### Phase 5: OPTIMIZE (optional — invoked explicitly)
+
+**Goal**: Run an automated optimization loop that iteratively improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then keeps only measured improvements.
+
+This phase is for routing/trigger optimization, not full code-generation benchmarking. Invoke it when the user says "optimize this skill", "optimize the description", or "run autoresearch". The existing manual A/B comparison (Phases 1-4) remains the path for full agent benchmarking.
+
+**Step 1: Validate optimization target and goal**
+
+Confirm the target file exists, has YAML frontmatter with a `description`, and the optimization goal is clear:
+
+```bash
+# Target must be a markdown file with frontmatter description
+test -f skills/{target}/SKILL.md
+rg -n '^description:' skills/{target}/SKILL.md
+
+# Goal should be specific and measurable
+# Good: "improve routing precision without losing recall"
+# Bad: "make it better"
+```
+
+**Step 2: Prepare trigger-rate eval tasks**
+
+```bash
+python3 skills/agent-comparison/scripts/optimize_loop.py \
+    --target skills/{target}/SKILL.md \
+    --goal "{optimization goal}" \
+    --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \
+    --train-split 0.6 \
+    --model claude-sonnet-4-20250514 \
+    --verbose
+```
+
+Supported task schemas:
+- Flat `tasks` list with optional `"split": "train" | "test"` per task
+- Top-level `train` and `test` arrays
+
+Every task must include:
+- `query`: the routing prompt to test
+- `should_trigger`: whether the target should trigger for that prompt
+
+If no split markers are present, the loop performs a reproducible random split using `--train-split` and seed `42`, stratified by `complexity`.
+
+**Step 3: Run baseline evaluation**
+
+The loop automatically evaluates the unmodified target against the train set before starting iteration. This establishes the score to beat, and records a held-out baseline if test tasks exist.
+
+**Step 4: Enter optimization loop**
+
+The `optimize_loop.py` script handles the full loop:
+- Calls `generate_variant.py` to propose changes (Claude with extended thinking)
+- Evaluates each variant against train tasks
+- Keeps variants that improve score by more than `--min-gain` (default 0.02)
+- Reverts variants that don't improve, break hard gates, or delete sections without justification
+- Checks held-out test set every 5 iterations for Goodhart divergence
+- Stops on convergence (5 consecutive reverts), Goodhart alarm, or max iterations
+
+```bash
+python3 skills/agent-comparison/scripts/optimize_loop.py \
+    --target skills/{target}/SKILL.md \
+    --goal "{optimization goal}" \
+    --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \
+    --max-iterations 20 \
+    --min-gain 0.02 \
+    --train-split 0.6 \
+    --model claude-sonnet-4-20250514 \
+    --report optimization-report.html \
+    --output-dir evals/iterations \
+    --verbose
+```
+
+The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and cherry-pick controls.
+
+**Step 5: Present results in UI**
+
+Open the generated `optimization-report.html` in a browser. The report shows:
+- Progress dashboard (status, baseline vs best, kept/reverted counts)
+- Convergence chart (train solid line, held-out dashed line, baseline dotted)
+- Iteration table with verdict, composite score, delta, and change summary
+- Expandable inline diffs per iteration (click any row)
+
+**Step 6: User cherry-picks improvements**
+
+Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews each kept iteration's diff and selects which to include:
+- Check the "Pick" checkbox for desired iterations
+- Click "Preview Combined" to see the merged diff
+- Click "Export Selected" to download a JSON file with chosen diffs
+
+**Step 7: Apply selected improvements to target file**
+
+Apply the selected improvements to the original target file.
+
+- If you want the best single kept variant, use `evals/iterations/best_variant.md`.
+- If you exported selected diffs, treat that JSON as review material for a manual follow-up apply step. It is not auto-applied by the current tooling.
+
+```bash
+# Review the best kept variant before applying
+cat evals/iterations/best_variant.md | head -20
+
+# Replace the target with the best kept variant
+cp evals/iterations/best_variant.md skills/{target}/SKILL.md
+```
+
+**Step 8: Run final evaluation on FULL task set (train + test)**
+
+After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize:
+
+```bash
+# Re-run optimize_loop.py against the same task file and inspect results.json/report output
+```
+
+Compare final scores to the baseline to confirm net improvement.
+
+**Step 9: Record in learning-db**
+
+```bash
+python3 scripts/learning-db.py learn \
+    --skill agent-comparison \
+    "autoresearch: {target} improved {baseline}→{best} over {iterations} iterations. \
+     Kept: {kept}/{total}. Stop: {reason}. Changes: {summaries}"
+```
+
+**Gate**: Optimization complete. Results reviewed. Cherry-picked improvements applied and verified against full task set. Results recorded.
+
 ### Optional Extensions
 
 These are off by default. Enable explicitly when needed:
diff --git a/skills/agent-comparison/references/benchmark-tasks.md b/skills/agent-comparison/references/benchmark-tasks.md
index c3f0d4b..f6142d1 100644
--- a/skills/agent-comparison/references/benchmark-tasks.md
+++ b/skills/agent-comparison/references/benchmark-tasks.md
@@ -184,3 +184,76 @@ cd benchmark/{task-name}/compact && go test -race -v
 # Manual alternative: compare outputs side-by-side using diff
 diff benchmark/{task-name}/full/ benchmark/{task-name}/compact/
 ```
+
+## Optimization Loop Task Format
+
+The current optimization loop is for frontmatter-description and routing-trigger quality. It does not run full code-generation benchmarks. Use Phase 5 with trigger-rate eval tasks for description optimization; use Phases 1-4 for full agent benchmarking.
+
+### Supported Task File Schemas
+
+Flat list with optional split markers:
+
+```json
+{
+  "tasks": [
+    {
+      "name": "go-testing-positive",
+      "split": "train",
+      "complexity": "complex",
+      "query": "write table-driven tests for a Go parser with subtests and helpers",
+      "should_trigger": true
+    },
+    {
+      "name": "kubernetes-negative",
+      "split": "test",
+      "complexity": "complex",
+      "query": "debug a kubernetes pod stuck in CrashLoopBackOff",
+      "should_trigger": false
+    }
+  ]
+}
+```
+
+Explicit top-level train/test sets:
+
+```json
+{
+  "train": [
+    {
+      "name": "positive-1",
+      "query": "write Go benchmarks and race tests for a worker pool",
+      "should_trigger": true
+    }
+  ],
+  "test": [
+    {
+      "name": "negative-1",
+      "query": "design a PostgreSQL indexing strategy",
+      "should_trigger": false
+    }
+  ]
+}
+```
+
+### Required Fields
+
+- `query`: the prompt used to test routing behavior
+- `should_trigger`: expected boolean outcome for the target description
+
+### Optional Fields
+
+- `name`: human-readable label shown in reports
+- `split`: `train` or `test` when using a flat `tasks` list
+- `complexity`: used for stratified splitting when no explicit split is provided
+
+### Split Strategy
+
+- `train` tasks are used during each optimization iteration.
+- `test` tasks are held out and checked every 5 iterations for Goodhart divergence.
+- If no split markers are present, the loop performs a reproducible random split with seed `42`, stratified by `complexity`.
+
+### Task Selection Principles for Optimization
+
+1. Cover both positive and negative routing examples. A description that only improves recall while tanking precision is not an improvement.
+2. Put at least one out-of-domain prompt in the held-out set. This catches overfitting where the description starts matching benchmark phrasing instead of the real scope.
+3. Use realistic user wording, not only canonical trigger phrases. Optimization on synthetic wording alone produces brittle routing behavior.
diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md
new file mode 100644
index 0000000..2c32067
--- /dev/null
+++ b/skills/agent-comparison/references/optimization-guide.md
@@ -0,0 +1,181 @@
+# Autoresearch Optimization Guide
+
+## Scope
+
+The current autoresearch loop optimizes a markdown target's frontmatter
+`description` using trigger-rate eval tasks. This is useful for improving
+skill routing accuracy and similar description-driven dispatch behavior.
+
+It is not a replacement for the manual agent benchmark workflow in Phases 1-4.
+If you want to compare real code-generation quality across benchmark tasks, use
+the normal A/B process.
+
+## Supported Targets
+
+- `skills/<name>/SKILL.md`
+- Other markdown targets with valid YAML frontmatter and a non-empty
+  `description`
+
+The loop rejects targets without parseable frontmatter or without a
+`description`, because trigger-rate evaluation depends on the target text that
+drives routing.
+
+## Supported Task Formats
+
+Every task must include:
+
+- `query`: the prompt to test
+- `should_trigger`: whether the target should trigger for that prompt
+
+Optional fields:
+
+- `name`: label shown in logs and reports
+- `split`: `train` or `test`
+- `complexity`: used for stratified splitting when `split` is omitted
+
+Flat task list:
+
+```json
+{
+  "tasks": [
+    {
+      "name": "positive-1",
+      "split": "train",
+      "complexity": "complex",
+      "query": "write table-driven Go tests with subtests and helper functions",
+      "should_trigger": true
+    },
+    {
+      "name": "negative-1",
+      "split": "test",
+      "complexity": "complex",
+      "query": "debug a Kubernetes pod stuck in CrashLoopBackOff",
+      "should_trigger": false
+    }
+  ]
+}
+```
+
+Explicit train/test sets:
+
+```json
+{
+  "train": [
+    {
+      "name": "positive-1",
+      "query": "write race-safe Go tests for a worker pool",
+      "should_trigger": true
+    }
+  ],
+  "test": [
+    {
+      "name": "negative-1",
+      "query": "optimize a PostgreSQL indexing strategy",
+      "should_trigger": false
+    }
+  ]
+}
+```
+
+If no split markers are present, the loop performs a reproducible random split
+using `--train-split` and seed `42`, stratified by `complexity`.
+
+## Command
+
+```bash
+python3 skills/agent-comparison/scripts/optimize_loop.py \
+  --target skills/go-testing/SKILL.md \
+  --goal "improve routing precision without losing recall" \
+  --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \
+  --train-split 0.6 \
+  --max-iterations 20 \
+  --min-gain 0.02 \
+  --model claude-sonnet-4-20250514 \
+  --report optimization-report.html \
+  --output-dir evals/iterations \
+  --verbose
+```
+
+Useful flags:
+
+- `--dry-run`: exercise the loop mechanics without API calls
+- `--report`: write a live HTML report
+- `--output-dir`: persist iteration snapshots and `results.json`
+
+## Evaluation Model
+
+The loop follows the ADR-131 structure:
+
+1. Hard gates
+2. Weighted composite score
+3. Held-out regression checks
+
+### Layer 1: Hard Gates
+
+An iteration is rejected immediately if any of these fail:
+
+- `parses`
+- `compiles`
+- `tests_pass`
+- `protected_intact`
+
+For description optimization, `parses` and `protected_intact` are the most
+important gates. Protected sections fenced by `DO NOT OPTIMIZE` markers must be
+preserved verbatim.
+
+### Layer 2: Composite Score
+
+The loop converts trigger-rate evaluation results into a weighted composite
+score using the built-in weights in `optimize_loop.py`. A variant is kept only
+if it beats the previous best by more than `--min-gain`.
+
+### Layer 3: Held-Out Regression Check
+
+Every 5 iterations, the current best variant is scored on the held-out test set.
+If held-out performance drops below the baseline while train performance has
+improved, the loop raises a Goodhart alarm and stops.
+
+## Deletion Safety Rule
+
+Deleting sections is allowed only with explicit justification.
+
+- `generate_variant.py` detects removed `##` headings
+- the model must return a `deletion_justification`
+- `optimize_loop.py` rejects deletions without one
+
+This enforces ADR-131's "no deletion without justification" rule.
+
+## Iteration Artifacts
+
+When `--output-dir` is set, the loop writes:
+
+- `001/variant.md`
+- `001/scores.json`
+- `001/verdict.json`
+- `001/diff.patch`
+- `best_variant.md`
+- `results.json`
+
+When `--report` is set, it also writes a live HTML dashboard showing:
+
+- status, baseline, best score, kept/reverted counts
+- convergence chart
+- iteration table with diffs
+- cherry-pick controls for kept iterations
+
+## Choosing Good Eval Tasks
+
+1. Include both positive and negative prompts.
+2. Put realistic user phrasing in both train and held-out sets.
+3. Keep at least one out-of-domain negative example in held-out.
+4. Do not let the eval set collapse into benchmark keywords only.
+
+## Limitations
+
+Current limitations are intentional and documented:
+
+- The loop does not execute full code-generation benchmarks.
+- Pattern-based benchmark tasks with `prompt`, `expected_patterns`, and
+  `forbidden_patterns` are not supported by `optimize_loop.py`.
+- For full agent quality comparisons, continue to use the manual benchmark and
+  grading flow in Phases 1-4.
diff --git a/skills/agent-comparison/references/optimization-tasks.example.json b/skills/agent-comparison/references/optimization-tasks.example.json
new file mode 100644
index 0000000..098226e
--- /dev/null
+++ b/skills/agent-comparison/references/optimization-tasks.example.json
@@ -0,0 +1,32 @@
+{
+  "tasks": [
+    {
+      "name": "positive-go-tests",
+      "split": "train",
+      "complexity": "complex",
+      "query": "write table-driven Go tests with subtests and helper functions",
+      "should_trigger": true
+    },
+    {
+      "name": "positive-benchmarks",
+      "split": "train",
+      "complexity": "simple",
+      "query": "add Go benchmarks and race-safe test coverage for a worker pool",
+      "should_trigger": true
+    },
+    {
+      "name": "negative-kubernetes",
+      "split": "test",
+      "complexity": "complex",
+      "query": "debug a kubernetes pod stuck in CrashLoopBackOff",
+      "should_trigger": false
+    },
+    {
+      "name": "negative-sql",
+      "split": "test",
+      "complexity": "simple",
+      "query": "design a PostgreSQL indexing strategy for a reporting query",
+      "should_trigger": false
+    }
+  ]
+}
diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py
new file mode 100644
index 0000000..2378e50
--- /dev/null
+++ b/skills/agent-comparison/scripts/generate_variant.py
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+"""Generate a variant of an agent/skill file using Claude with extended thinking.
+
+Proposes modifications to improve the target file based on the optimization
+goal and previous iteration failures. Preserves protected sections marked
+with DO NOT OPTIMIZE markers.
+
+Pattern: follows improve_description.py's Claude + extended thinking approach.
+
+Usage:
+    python3 skills/agent-comparison/scripts/generate_variant.py \
+        --target agents/golang-general-engineer.md \
+        --goal "improve error handling instructions" \
+        --current-content "..." \
+        --failures '[...]' \
+        --model claude-sonnet-4-20250514
+
+Output (JSON to stdout):
+    {
+        "variant": "full file content...",
+        "summary": "Added CRITICAL warning for error wrapping",
+        "deletion_justification": "",
+        "reasoning": "Extended thinking content...",
+        "tokens_used": 12345, "deletions": []
+    }
+
+See ADR-131 for safety rules.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+
+try:
+    import anthropic
+except ImportError:  # pragma: no cover - exercised in environments without the SDK
+    anthropic = None
+
+# ---------------------------------------------------------------------------
+# Protected section handling
+# ---------------------------------------------------------------------------
+
+_PROTECTED_RE = re.compile(
+    r"(<!--\s*DO NOT OPTIMIZE\s*-->.*?<!--\s*END DO NOT OPTIMIZE\s*-->)",
+    re.DOTALL,
+)
+
+
+def extract_protected(content: str) -> list[str]:
+    """Return each DO NOT OPTIMIZE ... END DO NOT OPTIMIZE block, markers included, in order."""
+    return _PROTECTED_RE.findall(content)
+
+
+def restore_protected(original: str, variant: str) -> str:
+    """Restore protected sections from original into variant."""
+    orig_sections = extract_protected(original)
+    var_sections = extract_protected(variant)
+
+    if len(orig_sections) != len(var_sections):
+        print(
+            "Warning: Protected section count mismatch "
+            f"(original={len(orig_sections)}, variant={len(var_sections)}).",
+            file=sys.stderr,
+        )
+        return variant
+
+    result = variant
+    for orig_sec, var_sec in zip(orig_sections, var_sections):
+        result = result.replace(var_sec, orig_sec, 1)
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Deletion detection
+# ---------------------------------------------------------------------------
+
+
+def detect_deletions(original: str, variant: str) -> list[str]:
+    """Find sections that exist in original but are missing from variant.
+
+    Returns a sorted list of `##` heading texts present in original but absent from variant.
+    """
+    orig_headings = set(re.findall(r"^##\s+(.+)$", original, re.MULTILINE))  # level-2 headings only
+    var_headings = set(re.findall(r"^##\s+(.+)$", variant, re.MULTILINE))
+    return sorted(orig_headings - var_headings)  # renamed headings count as deleted
+
+
+# ---------------------------------------------------------------------------
+# Variant generation
+# ---------------------------------------------------------------------------
+
+
+def generate_variant(
+    client: anthropic.Anthropic,
+    target_path: str,
+    goal: str,
+    current_content: str,
+    failures: list[dict],
+    model: str,
+    history: list[dict] | None = None,
+) -> dict:
+    """Ask Claude (extended thinking on) for a variant; exits with status 1 on API errors.
+
+    Returns dict with variant, summary, deletion_justification, reasoning, tokens_used, deletions.
+    """
+    # Build the optional prompt sections (each stays empty when there is nothing to report)
+    failure_section = ""
+    if failures:
+        failure_section = "\n\nFailed tasks from the last iteration:\n"
+        for f in failures:
+            failure_section += f"  - {f.get('name', 'unnamed')}: {f.get('details', 'failed')}\n"
+
+    history_section = ""
+    if history:
+        history_section = "\n\nPrevious attempts (do NOT repeat — try structurally different approaches):\n"
+        for h in history:
+            history_section += f"  Iteration {h.get('number', '?')}: {h.get('verdict', '?')} — {h.get('change_summary', '')}\n"
+
+    protected_sections = extract_protected(current_content)
+    protected_notice = ""
+    if protected_sections:
+        protected_notice = f"""
+
+CRITICAL SAFETY RULE: The file contains {len(protected_sections)} protected section(s) marked with
+<!-- DO NOT OPTIMIZE --> and <!-- END DO NOT OPTIMIZE --> markers.
+You MUST preserve these sections EXACTLY as they are — character for character.
+Do not add, remove, or modify anything between these markers.
+This is non-negotiable: protected sections contain safety gates that must not be
+removed even if removing them would improve test scores."""
+
+    prompt = f"""You are optimizing an agent/skill file to improve its performance.
+
+Target file: {target_path}
+Optimization goal: {goal}
+
+Current content of the file:
+<current_content>
+{current_content}
+</current_content>
+{failure_section}{history_section}{protected_notice}
+
+SAFETY RULES:
+1. Do NOT delete sections without replacing them with equivalent or better content.
+   If you remove a section heading that exists in the original, you must explain what
+   replaces the removed functionality. Pure deletion degrades unmeasured capabilities.
+
+2. Do NOT change the tools, SDKs, or interfaces the agent uses. The variant must work
+   in the same environment as the original (no switching from SDK to curl, etc.).
+
+3. Keep YAML frontmatter structure intact (name, description, routing, etc.).
+
+4. Focus on making the agent/skill better at achieving the stated goal. Common
+   improvements include:
+   - Moving critical information to more prominent positions (CRITICAL banners)
+   - Adding explicit planning steps before code generation
+   - Improving error handling instructions with specific patterns
+   - Adding concrete examples for ambiguous instructions
+   - Restructuring for clarity when sections are dense
+
+Please respond with the complete modified file content inside <variant> tags,
+and a brief summary of what you changed and why inside <summary> tags.
+
+If you removed any existing `##` section heading, include a brief justification
+inside <deletion_justification> tags. If you did not remove a section, return
+empty tags.
+
+<variant>
+[complete file content here]
+</variant>
+
+<summary>
+[1-2 sentence description of the change]
+</summary>
+
+<deletion_justification>
+[why any removed section was replaced safely, or leave blank]
+</deletion_justification>"""
+
+    try:
+        response = client.messages.create(
+            model=model,
+            max_tokens=16000,
+            thinking={
+                "type": "enabled",
+                "budget_tokens": 10000,
+            },
+            messages=[{"role": "user", "content": prompt}],
+        )
+    except anthropic.APIStatusError as e:
+        print(f"Error: API returned status {e.status_code}: {e.message}", file=sys.stderr)
+        sys.exit(1)
+    except anthropic.APIConnectionError as e:
+        print(f"Error: API connection failed: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Extract thinking and text (a later block of the same type overwrites an earlier one)
+    thinking_text = ""
+    text = ""
+    for block in response.content:
+        if block.type == "thinking":
+            thinking_text = block.thinking
+        elif block.type == "text":
+            text = block.text
+
+    # Parse variant content; a reply without <variant> tags aborts the process
+    variant_match = re.search(r"<variant>(.*?)</variant>", text, re.DOTALL)
+    if not variant_match:
+        print("Error: No <variant> tags in response", file=sys.stderr)
+        sys.exit(1)
+
+    variant = variant_match.group(1).strip()
+
+    # Parse summary (falls back to a placeholder when the tags are absent)
+    summary_match = re.search(r"<summary>(.*?)</summary>", text, re.DOTALL)
+    summary = summary_match.group(1).strip() if summary_match else "No summary provided"
+
+    deletion_match = re.search(r"<deletion_justification>(.*?)</deletion_justification>", text, re.DOTALL)
+    deletion_justification = deletion_match.group(1).strip() if deletion_match else ""
+
+    # Restore protected sections (safety net); leaves the variant as-is if the section count changed
+    variant = restore_protected(current_content, variant)
+
+    # Report removed `##` headings; this only warns, and the result carries them as "deletions"
+    deletions = detect_deletions(current_content, variant)
+    if deletions:
+        print(f"Warning: Deleted sections: {deletions}", file=sys.stderr)
+
+    tokens_used = response.usage.input_tokens + response.usage.output_tokens
+
+    return {
+        "variant": variant,
+        "summary": summary,
+        "deletion_justification": deletion_justification,
+        "reasoning": thinking_text,
+        "tokens_used": tokens_used,
+        "deletions": deletions,
+    }
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate agent/skill variant using Claude")
+    parser.add_argument("--target", required=True, help="Path to target file (for context)")
+    parser.add_argument("--goal", required=True, help="Optimization goal")
+    content_group = parser.add_mutually_exclusive_group(required=True)
+    content_group.add_argument("--current-content", help="Current file content")
+    content_group.add_argument("--current-content-file", help="Path to a file containing the current content")
+    parser.add_argument("--failures", default="[]", help="JSON list of failed tasks")
+    parser.add_argument("--history", default="[]", help="JSON list of previous iterations")
+    parser.add_argument("--model", default="claude-sonnet-4-20250514", help="Model to use")
+    args = parser.parse_args()
+
+    try:
+        failures = json.loads(args.failures)
+    except json.JSONDecodeError as e:
+        print(f"Error: --failures is not valid JSON: {e}", file=sys.stderr)
+        sys.exit(1)
+    try:
+        history = json.loads(args.history)
+    except json.JSONDecodeError as e:
+        print(f"Error: --history is not valid JSON: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    if anthropic is None:
+        print("Error: anthropic SDK is not installed", file=sys.stderr)
+        sys.exit(1)
+
+    current_content = (
+        open(args.current_content_file, encoding="utf-8").read()
+        if args.current_content_file
+        else args.current_content
+    )
+
+    client = anthropic.Anthropic()
+    result = generate_variant(
+        client=client,
+        target_path=args.target,
+        goal=args.goal,
+        current_content=current_content,
+        failures=failures,
+        model=args.model,
+        history=history if history else None,
+    )
+
+    print(json.dumps(result, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py
new file mode 100644
index 0000000..228dd1d
--- /dev/null
+++ b/skills/agent-comparison/scripts/optimize_loop.py
@@ -0,0 +1,1107 @@
+#!/usr/bin/env python3
+"""Autoresearch optimization loop for agent/skill files.
+
+Wraps the existing agent-comparison evaluation infrastructure in an outer
+loop that proposes variants, evaluates them, and keeps/reverts based on
+score improvement. The keep/revert decision is arithmetic — no LLM
+judgment in the loop itself.
+
+Usage:
+    python3 skills/agent-comparison/scripts/optimize_loop.py \
+        --target agents/golang-general-engineer.md \
+        --goal "improve error handling instructions" \
+        --benchmark-tasks tasks.json \
+        --max-iterations 20 \
+        --min-gain 0.02
+
+See ADR-131 for architecture details.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+import re
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Scoring helpers
+# ---------------------------------------------------------------------------
+
+# Per-dimension weights applied by composite_score(); the five weights sum to 1.0.
+QUALITY_WEIGHTS = {
+    "correctness": 0.40,
+    "error_handling": 0.20,
+    "language_idioms": 0.15,
+    "testing": 0.15,
+    "efficiency": 0.10,
+}
+
+# Score keys checked by passes_hard_gates(); each must be present and truthy.
+HARD_GATE_KEYS = ["parses", "compiles", "tests_pass", "protected_intact"]
+
+
+def passes_hard_gates(scores: dict) -> bool:
+    """Layer 1 check: report whether every hard gate in *scores* is satisfied.
+
+    A gate listed in ``HARD_GATE_KEYS`` that is missing from *scores* counts
+    as failed, exactly like a gate whose value is falsy.
+    """
+    for gate in HARD_GATE_KEYS:
+        if not scores.get(gate, False):
+            return False
+    return True
+
+
+def composite_score(scores: dict) -> float:
+    """Layer 2: collapse the quality dimensions into one weighted number.
+
+    Returns 0.0 outright when any hard gate fails. Otherwise each dimension
+    named in ``QUALITY_WEIGHTS`` (missing dimensions count as 0.0) is
+    multiplied by its weight, the products are added up, and the total is
+    rounded to four decimal places.
+    """
+    if not passes_hard_gates(scores):
+        return 0.0
+
+    weighted_total = 0.0
+    for dimension in QUALITY_WEIGHTS:
+        weighted_total += QUALITY_WEIGHTS[dimension] * scores.get(dimension, 0.0)
+    return round(weighted_total, 4)
+
+
+def holdout_diverges(
+    train_score: float,
+    holdout_score: float,
+    baseline_holdout: float,
+    baseline_train: float = 0.0,
+    threshold: float = 0.5,
+) -> bool:
+    """Goodhart alarm: train got better while the held-out score fell.
+
+    Returns True only when both conditions hold: ``train_score`` is strictly
+    above ``baseline_train``, and the held-out score has dropped below its
+    baseline by strictly more than ``threshold``.
+    """
+    holdout_drop = baseline_holdout - holdout_score
+    if not train_score > baseline_train:
+        return False
+    return holdout_drop > threshold
+
+
+# ---------------------------------------------------------------------------
+# Iteration snapshot
+# ---------------------------------------------------------------------------
+
+
+def save_iteration(
+    output_dir: Path,
+    iteration: int,
+    variant_content: str,
+    scores: dict,
+    verdict: str,
+    reasoning: str,
+    diff_text: str,
+    change_summary: str,
+    stop_reason: str | None = None,
+    deletions: list[str] | None = None,
+    deletion_justification: str = "",
+) -> dict:
+    """Write one iteration's artifacts to disk and return its verdict record.
+
+    Files are created under ``output_dir/<iteration zero-padded to 3 digits>/``:
+    ``variant.md``, ``scores.json``, ``verdict.json`` and, only when
+    ``diff_text`` is non-empty, ``diff.patch``. The directory is created if
+    it does not exist yet.
+
+    Args:
+        output_dir: Parent directory for all iteration snapshots.
+        iteration: Iteration number; used for the sub-directory name.
+        variant_content: Full text of the variant evaluated this iteration.
+        scores: Raw scores dict; also fed to ``composite_score``.
+        verdict: Verdict label stored in the record.
+        reasoning: Free-text reasoning stored in the record.
+        diff_text: Unified diff for this iteration; skipped when empty.
+        change_summary: Short summary of the change.
+        stop_reason: Optional stop reason stored in the record.
+        deletions: Optional list of deleted items; stored as ``[]`` when None.
+        deletion_justification: Justification text for the deletions.
+
+    Returns:
+        The dict that was serialized to ``verdict.json``.
+    """
+    iter_dir = output_dir / f"{iteration:03d}"
+    iter_dir.mkdir(parents=True, exist_ok=True)
+
+    # Always write UTF-8: variant and diff text may contain non-ASCII
+    # characters that the platform's default locale encoding cannot represent.
+    (iter_dir / "variant.md").write_text(variant_content, encoding="utf-8")
+    (iter_dir / "scores.json").write_text(json.dumps(scores, indent=2), encoding="utf-8")
+
+    verdict_data = {
+        "iteration": iteration,
+        "verdict": verdict,
+        "composite_score": composite_score(scores),
+        "change_summary": change_summary,
+        "reasoning": reasoning,
+        "stop_reason": stop_reason,
+        "deletions": deletions or [],
+        "deletion_justification": deletion_justification,
+    }
+    (iter_dir / "verdict.json").write_text(json.dumps(verdict_data, indent=2), encoding="utf-8")
+
+    if diff_text:
+        (iter_dir / "diff.patch").write_text(diff_text, encoding="utf-8")
+
+    return verdict_data
+
+
+# ---------------------------------------------------------------------------
+# Diff generation
+# ---------------------------------------------------------------------------
+
+
+def generate_diff(original: str, variant: str, label: str = "target") -> str:
+    """Generate a unified diff between two strings.
+
+    Args:
+        original: The "before" text.
+        variant: The "after" text.
+        label: File name used in the ``a/<label>`` / ``b/<label>`` headers.
+
+    Returns:
+        The unified diff as one string, or ``""`` when the texts are equal.
+        A content line that has no line terminator (the last line of a text
+        that does not end with a newline) is followed by the conventional
+        ``\\ No newline at end of file`` marker, so it can never run into the
+        next diff line.
+    """
+    import difflib
+
+    diff_lines = difflib.unified_diff(
+        original.splitlines(keepends=True),
+        variant.splitlines(keepends=True),
+        fromfile=f"a/{label}",
+        tofile=f"b/{label}",
+        lineterm="\n",
+    )
+
+    pieces: list[str] = []
+    for line in diff_lines:
+        pieces.append(line)
+        # splitlines() strips a terminator if there is one, so equality means
+        # this diff line carries none and needs one added, plus the marker.
+        if line.splitlines()[0] == line:
+            pieces.append("\n\\ No newline at end of file\n")
+    return "".join(pieces)
+
+
+def make_dry_run_variant(current_content: str, iteration: int) -> tuple[str, str, str]:
+    """Build the deterministic synthetic variant used by --dry-run mode.
+
+    An HTML comment naming the iteration is appended on its own line. If that
+    exact comment is already present in the content, a ``b`` is added after
+    the iteration number instead. Returns ``(variant, summary, reasoning)``.
+    """
+    already_tagged = f"<!-- dry-run iteration {iteration} -->" in current_content
+    tag = f"<!-- dry-run iteration {iteration}{'b' if already_tagged else ''} -->"
+    # Only insert a separating newline when the content does not end with one.
+    separator = "" if current_content.endswith("\n") else "\n"
+    return (
+        f"{current_content}{separator}{tag}\n",
+        "Synthetic dry-run mutation",
+        "dry-run synthetic variant",
+    )
+
+
+# ---------------------------------------------------------------------------
+# HTML report generation
+# ---------------------------------------------------------------------------
+
+
+def _build_report_data(
+    target: str,
+    goal: str,
+    baseline_composite: float,
+    baseline_holdout: float | None,
+    train_size: int,
+    test_size: int,
+    iterations: list[dict],
+    max_iterations: int,
+    status: str,
+    total_tokens: int,
+) -> dict:
+    """Assemble the payload dict consumed by the HTML report generator.
+
+    Baseline scores and task counts are each grouped into a nested
+    ``{"train": ..., "test": ...}`` mapping; all other arguments are stored
+    under keys matching their parameter names.
+    """
+    report = dict(target=target, goal=goal, status=status)
+    report["baseline_score"] = dict(train=baseline_composite, test=baseline_holdout)
+    report["task_counts"] = dict(train=train_size, test=test_size)
+    report["max_iterations"] = max_iterations
+    report["total_tokens"] = total_tokens
+    report["iterations"] = iterations
+    return report
+
+
+def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
+    """Render the iteration history as a self-contained HTML page.
+
+    Args:
+        data: Report payload. Keys read here: ``target``, ``goal``,
+            ``status``, ``iterations``, ``baseline_score``, ``task_counts``
+            and ``max_iterations`` (all optional). Each iteration dict must
+            provide ``number``, ``verdict`` and ``score``; ``delta``,
+            ``change_summary`` and ``diff`` are optional.
+        auto_refresh: When true, add a meta tag that reloads the page every
+            10 seconds.
+
+    Returns:
+        The complete HTML document as a string.
+
+    Every value interpolated into markup goes through ``html.escape``. JSON
+    embedded in the inline script has each ``<`` replaced by its JSON unicode
+    escape, so diff text containing a closing script tag cannot end the
+    script block early. The convergence chart is built client-side with DOM
+    methods (createElementNS, setAttribute, textContent), not innerHTML.
+    """
+    import html as html_mod
+
+    def script_json(value) -> str:
+        # "<" can only occur inside JSON string values, so swapping it for
+        # its unicode escape keeps the JSON valid and equal once parsed.
+        return json.dumps(value).replace("<", "\\u003c")
+
+    target = html_mod.escape(str(data.get("target", "")))
+    goal = html_mod.escape(str(data.get("goal", "")))
+    status = str(data.get("status", "RUNNING"))
+    status_esc = html_mod.escape(status)
+    iterations = data.get("iterations", [])
+    baseline = data.get("baseline_score", {})
+    task_counts = data.get("task_counts", {})
+
+    refresh = '<meta http-equiv="refresh" content="10">' if auto_refresh else ""
+
+    row_chunks: list[str] = []
+    for it in iterations:
+        verdict = str(it["verdict"])
+        # Unknown verdicts map to an empty suffix, so the class stays inert.
+        vcls = {"KEEP": "keep", "REVERT": "revert", "STOP": "stop"}.get(verdict, "")
+        verdict_esc = html_mod.escape(verdict)
+        sc = it["score"]
+        train_score = sc.get("train")
+        test_score = sc.get("test")
+        score_str = f"{train_score:.2f}" if isinstance(train_score, (int, float)) else "?"
+        if isinstance(test_score, (int, float)):
+            score_str += f" / {test_score:.2f}"
+        delta = str(it.get("delta", ""))
+        dcls = "d-pos" if delta.startswith("+") and delta != "+0" else "d-neg" if delta.startswith("-") else "d-zero"
+        delta_esc = html_mod.escape(delta)
+        summary = html_mod.escape(str(it.get("change_summary", ""))[:80])
+        diff_esc = html_mod.escape(str(it.get("diff", "")))
+        # Only kept iterations can be cherry-picked: checked when kept,
+        # disabled otherwise.
+        pick_state = "checked" if verdict == "KEEP" else "disabled"
+        n_esc = html_mod.escape(str(it["number"]))
+
+        row_chunks.append(f"""
+        <tr class="iter-row" data-iteration="{n_esc}">
+          <td>{n_esc}</td>
+          <td><span class="verdict-{vcls}">{verdict_esc}</span></td>
+          <td>{score_str}</td>
+          <td class="{dcls}">{delta_esc}</td>
+          <td>{summary}</td>
+          <td><label><input type="checkbox" class="cherry-pick-cb" data-iteration="{n_esc}" {pick_state}> Pick</label></td>
+        </tr>
+        <tr class="diff-row hidden" id="diff-{n_esc}">
+          <td colspan="6"><pre class="diff-block">{diff_esc}</pre></td>
+        </tr>""")
+    rows = "".join(row_chunks)
+
+    chart_json = script_json([
+        {"x": it["number"], "train": it["score"].get("train", 0), "test": it["score"].get("test")}
+        for it in iterations
+    ])
+    diffs_json = script_json({it["number"]: str(it.get("diff", "")) for it in iterations})
+
+    bt = baseline.get("train", 0.0)
+    if not isinstance(bt, (int, float)):
+        # A missing or None baseline would break the numeric formatting below.
+        bt = 0.0
+    # Skip non-numeric train scores so one bad iteration cannot crash max().
+    train_scores = [it["score"].get("train", bt) for it in iterations]
+    best = max((s for s in train_scores if isinstance(s, (int, float))), default=bt)
+    kept = sum(1 for it in iterations if it["verdict"] == "KEEP")
+    reverted = sum(1 for it in iterations if it["verdict"] == "REVERT")
+    cur = len(iterations)
+    mx = html_mod.escape(str(data.get("max_iterations", 20)))
+    scls = "running" if status == "RUNNING" else "done" if status in ("CONVERGED", "COMPLETE") else "alarm"
+    score_label = f"Train tasks: {task_counts.get('train', 0)}"
+    if task_counts.get("test"):
+        score_label += f" | Held-out tasks: {task_counts['test']}"
+    score_label = html_mod.escape(score_label)
+
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">{refresh}
+<title>Optimization: {target}</title>
+<style>
+:root {{ --bg:#0a0c10;--surface:#111318;--surface-2:#161a22;--border:#222832;--text:#b8c4d4;--muted:#5c6a7e;--bright:#e8edf5;--accent:#4d8ef5;--green:#3dba6c;--green-dim:#0d2420;--red:#e05454;--red-dim:#2a1015;--yellow:#d4a830;--font-sans:-apple-system,BlinkMacSystemFont,'Segoe UI',system-ui,sans-serif;--font-mono:'SF Mono','Cascadia Code','Fira Code',monospace;--radius:8px; }}
+*,*::before,*::after {{ margin:0;padding:0;box-sizing:border-box; }}
+body {{ font-family:var(--font-sans);background:var(--bg);color:var(--text);font-size:14px;padding:24px 32px; }}
+h1 {{ font-size:18px;color:var(--bright);margin-bottom:4px; }}
+.subtitle {{ color:var(--muted);font-size:13px;margin-bottom:20px; }}
+.dashboard {{ background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px 20px;margin-bottom:20px;display:grid;grid-template-columns:repeat(auto-fit,minmax(160px,1fr));gap:12px; }}
+.dash-item {{ display:flex;flex-direction:column;gap:2px; }}
+.dash-label {{ font-size:11px;color:var(--muted);text-transform:uppercase;letter-spacing:0.06em; }}
+.dash-value {{ font-size:16px;font-weight:600;color:var(--bright);font-variant-numeric:tabular-nums; }}
+.dash-value.running {{ color:var(--accent); }}
+.dash-value.done {{ color:var(--green); }}
+.dash-value.alarm {{ color:var(--red); }}
+.chart-box {{ background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;margin-bottom:20px; }}
+table {{ width:100%;border-collapse:collapse;font-size:13px; }}
+th,td {{ padding:8px 12px;text-align:left;border-bottom:1px solid var(--border); }}
+th {{ color:var(--muted);font-size:11px;text-transform:uppercase;letter-spacing:0.06em;background:var(--surface-2); }}
+.iter-row {{ cursor:pointer;transition:background 0.1s; }}
+.iter-row:hover {{ background:var(--surface-2); }}
+.diff-row td {{ padding:0; }}
+.diff-block {{ background:#080b0f;padding:12px;font-family:var(--font-mono);font-size:11px;max-height:400px;overflow:auto;white-space:pre;line-height:1.5;color:var(--muted); }}
+.verdict-keep {{ color:var(--green);font-weight:600; }}
+.verdict-revert {{ color:var(--red);font-weight:600; }}
+.verdict-stop {{ color:var(--yellow);font-weight:600; }}
+.d-pos {{ color:var(--green);font-weight:600; }}
+.d-neg {{ color:var(--red);font-weight:600; }}
+.d-zero {{ color:var(--muted); }}
+.hidden {{ display:none; }}
+.actions {{ margin-top:16px;display:flex;gap:10px; }}
+.btn {{ padding:8px 18px;border-radius:var(--radius);border:1px solid var(--border);background:var(--surface-2);color:var(--text);cursor:pointer;font-size:13px;font-family:var(--font-sans); }}
+.btn:hover {{ background:var(--surface);color:var(--bright); }}
+.btn-primary {{ background:var(--accent);color:#fff;border:none; }}
+.btn-primary:hover {{ background:#5a99f8; }}
+</style>
+</head>
+<body>
+<h1>Optimization: {target}</h1>
+<p class="subtitle">Goal: {goal}</p>
+<div class="dashboard">
+  <div class="dash-item"><span class="dash-label">Status</span><span class="dash-value {scls}">{status_esc}</span></div>
+  <div class="dash-item"><span class="dash-label">Progress</span><span class="dash-value">{cur}/{mx}</span></div>
+  <div class="dash-item"><span class="dash-label">Baseline</span><span class="dash-value">{bt:.2f}</span></div>
+  <div class="dash-item"><span class="dash-label">Best</span><span class="dash-value">{best:.2f} ({best - bt:+.2f})</span></div>
+  <div class="dash-item"><span class="dash-label">Kept</span><span class="dash-value">{kept}</span></div>
+  <div class="dash-item"><span class="dash-label">Reverted</span><span class="dash-value">{reverted}</span></div>
+</div>
+<p class="subtitle">{score_label}</p>
+<div class="chart-box" id="chart"></div>
+<table>
+<thead><tr><th>#</th><th>Verdict</th><th>Score</th><th>Delta</th><th>Change</th><th>Pick</th></tr></thead>
+<tbody>{rows}</tbody>
+</table>
+<div class="actions">
+  <button class="btn btn-primary" id="btn-preview">Preview Combined</button>
+  <button class="btn" id="btn-export">Export Selected</button>
+</div>
+<div id="preview-area" class="hidden" style="margin-top:16px">
+  <h3 style="color:var(--bright);margin-bottom:8px">Combined Preview</h3>
+  <pre class="diff-block" id="preview-content"></pre>
+</div>
+<script>
+// Toggle diff rows
+document.querySelectorAll('.iter-row').forEach(function(row) {{
+  row.addEventListener('click', function(e) {{
+    if (e.target.type === 'checkbox') return;
+    document.getElementById('diff-' + row.dataset.iteration).classList.toggle('hidden');
+  }});
+}});
+
+// Convergence chart — safe DOM construction only (no innerHTML)
+var points = {chart_json};
+var bscore = {bt};
+var scoreCandidates = points.reduce(function(acc, point) {{
+  if (point.train != null) acc.push(point.train);
+  if (point.test != null) acc.push(point.test);
+  return acc;
+}}, [bscore]);
+
+function drawChart() {{
+  var box = document.getElementById('chart');
+  if (!points.length) {{ box.textContent = 'No iterations yet'; return; }}
+  var W = Math.min(box.clientWidth - 32, 800), H = 200;
+  var pad = {{l:40, r:20, t:10, b:30}};
+  var pW = W - pad.l - pad.r, pH = H - pad.t - pad.b;
+  var xMax = Math.max.apply(null, points.map(function(p){{return p.x}}));
+  if (xMax < 1) xMax = 1;
+  var rawMax = Math.max.apply(null, scoreCandidates);
+  var yMin = Math.max(0, Math.floor(Math.min.apply(null, scoreCandidates)) - 0.5);
+  var yMax = Math.ceil(rawMax + 0.5);
+  if (yMax <= yMin) yMax = yMin + 1;
+  function sx(x) {{ return pad.l + (x / xMax) * pW; }}
+  function sy(y) {{ return pad.t + pH - ((y - yMin) / (yMax - yMin)) * pH; }}
+  var NS = 'http://www.w3.org/2000/svg';
+  var svg = document.createElementNS(NS, 'svg');
+  svg.setAttribute('width', String(W));
+  svg.setAttribute('height', String(H));
+  svg.style.display = 'block';
+  function line(x1,y1,x2,y2,s,w,d) {{
+    var l = document.createElementNS(NS,'line');
+    l.setAttribute('x1',x1);l.setAttribute('y1',y1);l.setAttribute('x2',x2);l.setAttribute('y2',y2);
+    l.setAttribute('stroke',s);l.setAttribute('stroke-width',w);
+    if(d)l.setAttribute('stroke-dasharray',d);
+    svg.appendChild(l);
+  }}
+  function circ(cx,cy,r,f,s) {{
+    var c = document.createElementNS(NS,'circle');
+    c.setAttribute('cx',cx);c.setAttribute('cy',cy);c.setAttribute('r',r);
+    c.setAttribute('fill',f||'none');if(s)c.setAttribute('stroke',s);
+    svg.appendChild(c);
+  }}
+  function txt(x,y,t,f,sz,a) {{
+    var e = document.createElementNS(NS,'text');
+    e.setAttribute('x',x);e.setAttribute('y',y);e.setAttribute('fill',f);
+    e.setAttribute('font-size',sz);if(a)e.setAttribute('text-anchor',a);
+    e.textContent = t; svg.appendChild(e);
+  }}
+  function path(d,s,w,da) {{
+    var p = document.createElementNS(NS,'path');
+    p.setAttribute('d',d);p.setAttribute('fill','none');
+    p.setAttribute('stroke',s);p.setAttribute('stroke-width',w);
+    if(da)p.setAttribute('stroke-dasharray',da);
+    svg.appendChild(p);
+  }}
+  for(var y=yMin;y<=yMax+0.001;y+=0.5){{line(pad.l,sy(y),W-pad.r,sy(y),'#222832',1);txt(pad.l-6,sy(y)+4,y.toFixed(1),'#5c6a7e',10,'end');}}
+  line(pad.l,sy(bscore),W-pad.r,sy(bscore),'#d4a830',1,'4,4');
+  var tp=points.filter(function(p){{return p.train!=null}});
+  if(tp.length>1){{var d=tp.map(function(p,i){{return(i===0?'M':'L')+sx(p.x)+','+sy(p.train)}}).join(' ');path(d,'#4d8ef5',2);}}
+  tp.forEach(function(p){{circ(sx(p.x),sy(p.train),3,'#4d8ef5');}});
+  var hp=points.filter(function(p){{return p.test!=null}});
+  if(hp.length>1){{var d2=hp.map(function(p,i){{return(i===0?'M':'L')+sx(p.x)+','+sy(p.test)}}).join(' ');path(d2,'#3dba6c',2,'6,3');}}
+  hp.forEach(function(p){{circ(sx(p.x),sy(p.test),3,'none','#3dba6c');}});
+  for(var x=1;x<=xMax;x++){{txt(sx(x),H-5,String(x),'#5c6a7e',10,'middle');}}
+  txt(pad.l+10,pad.t+14,'Train','#4d8ef5',10);
+  txt(pad.l+50,pad.t+14,'Held-out','#3dba6c',10);
+  txt(pad.l+110,pad.t+14,'Baseline','#d4a830',10);
+  box.replaceChildren(svg);
+}}
+drawChart();
+window.addEventListener('resize', drawChart);
+
+var iterDiffs = {diffs_json};
+function getSelected(){{return Array.from(document.querySelectorAll('.cherry-pick-cb:checked')).map(function(cb){{return parseInt(cb.dataset.iteration)}});}}
+document.getElementById('btn-preview').addEventListener('click',function(){{
+  var sel=getSelected();if(!sel.length){{alert('No iterations selected');return;}}
+  var combined=sel.map(function(n){{return'--- Iteration '+n+' ---\\n'+(iterDiffs[String(n)]||'(no diff)')}}).join('\\n\\n');
+  document.getElementById('preview-content').textContent=combined;
+  document.getElementById('preview-area').classList.remove('hidden');
+}});
+document.getElementById('btn-export').addEventListener('click',function(){{
+  var sel=getSelected();if(!sel.length){{alert('No iterations selected');return;}}
+  var out={{selected_iterations:sel,diffs:{{}}}};
+  sel.forEach(function(n){{out.diffs[String(n)]=iterDiffs[String(n)]||''}});
+  var blob=new Blob([JSON.stringify(out,null,2)],{{type:'application/json'}});
+  var url=URL.createObjectURL(blob);
+  var a=document.createElement('a');a.href=url;a.download='cherry-picked-iterations.json';a.click();
+  URL.revokeObjectURL(url);
+}});
+</script>
+</body>
+</html>"""
+
+
+# ---------------------------------------------------------------------------
+# Task loading and splitting
+# ---------------------------------------------------------------------------
+
+
+def load_benchmark_tasks(path: Path) -> list[dict]:
+    """Load benchmark tasks from a JSON file.
+
+    Three layouts are accepted:
+
+    * a bare list of tasks, returned as-is;
+    * ``{"tasks": [...]}``, whose list is returned as-is;
+    * ``{"train": [...], "test": [...]}`` (either key may be absent), which is
+      flattened into one list of shallow task copies, each given a ``split``
+      field naming its source list unless the task already has one.
+
+    Raises:
+        ValueError: If the top-level JSON value matches none of these layouts,
+            including scalars and strings. Malformed JSON also surfaces as
+            ``json.JSONDecodeError``, itself a ``ValueError`` subclass.
+    """
+    # JSON interchange text is UTF-8; do not depend on the locale default.
+    data = json.loads(path.read_text(encoding="utf-8"))
+    if isinstance(data, list):
+        return data
+    # Membership tests are only meaningful on a mapping: on a str they would
+    # be substring checks, and on a number they raise TypeError.
+    if isinstance(data, dict):
+        if "tasks" in data:
+            return data["tasks"]
+        if "train" in data or "test" in data:
+            tasks = []
+            for split_name in ("train", "test"):
+                for task in data.get(split_name, []):
+                    normalized = dict(task)
+                    normalized.setdefault("split", split_name)
+                    tasks.append(normalized)
+            return tasks
+    raise ValueError("Task file must be a list, {'tasks': [...]}, or {'train': [...], 'test': [...]}.")
+
+
+def split_tasks(
+    tasks: list[dict],
+    train_split: float,
+    seed: int = 42,
+) -> tuple[list[dict], list[dict]]:
+    """Partition tasks into ``(train, test)`` lists.
+
+    If any task carries a ``split`` field, that field decides: tasks marked
+    ``"train"`` or lacking the field go to train, tasks marked ``"test"`` go
+    to test, and ``train_split`` / ``seed`` are not used.
+
+    Otherwise tasks are grouped by their ``complexity`` value (``"medium"``
+    when absent). Each group is shuffled with a ``random.Random(seed)``
+    generator and cut so that ``int(len(group) * train_split)`` tasks, but
+    never fewer than one, land in train and the remainder in test.
+    """
+    for candidate in tasks:
+        if "split" in candidate:
+            explicit_train = [t for t in tasks if t.get("split", "train") == "train"]
+            explicit_test = [t for t in tasks if t.get("split") == "test"]
+            return explicit_train, explicit_test
+
+    # Bucket by complexity, keeping first-seen order so the shuffles below
+    # consume the random stream in a reproducible sequence.
+    buckets: dict[str, list[dict]] = {}
+    for task in tasks:
+        level = task.get("complexity", "medium")
+        if level not in buckets:
+            buckets[level] = []
+        buckets[level].append(task)
+
+    shuffler = random.Random(seed)
+    train_set: list[dict] = []
+    held_out: list[dict] = []
+    for bucket in buckets.values():
+        shuffler.shuffle(bucket)
+        cut = max(1, int(len(bucket) * train_split))
+        train_set += bucket[:cut]
+        held_out += bucket[cut:]
+
+    return train_set, held_out
+
+
+# ---------------------------------------------------------------------------
+# Frontmatter parsing
+# ---------------------------------------------------------------------------
+
+
+def _parse_frontmatter(content: str) -> tuple[bool, str]:
+    """Extract the ``description`` from leading ``---`` delimited frontmatter.
+
+    This is a small line-based scan, not a YAML parser. Only a top-level
+    ``description:`` key is recognised; if it appears more than once the
+    last occurrence wins.
+
+    Returns:
+        ``(valid, description)``. ``valid`` is False when the content does
+        not start with ``---`` or no closing ``---`` line exists. The
+        description is ``""`` when the key is absent. A plain value has
+        surrounding quote characters stripped; a block scalar (``|``, ``>``,
+        with an optional ``-`` or ``+`` chomping indicator) is collapsed into
+        a single line joined by spaces.
+    """
+    if not content.startswith("---"):
+        return False, ""
+    lines = content.split("\n")
+    end_idx = next(
+        (i for i, line in enumerate(lines[1:], start=1) if line.strip() == "---"),
+        None,
+    )
+    if end_idx is None:
+        return False, ""
+
+    block_indicators = (">", "|", ">-", "|-", ">+", "|+")
+    description = ""
+    fm_lines = lines[1:end_idx]
+    idx = 0
+    while idx < len(fm_lines):
+        line = fm_lines[idx]
+        idx += 1
+        if not line.startswith("description:"):
+            continue
+
+        value = line[len("description:"):].strip()
+        if value not in block_indicators:
+            description = value.strip('"').strip("'")
+            continue
+
+        # Block scalar: gather the indented continuation lines. Blank lines
+        # inside the block are skipped rather than ending it, so a
+        # multi-paragraph description is not cut off at the first paragraph.
+        parts: list[str] = []
+        while idx < len(fm_lines):
+            candidate = fm_lines[idx]
+            text = candidate.strip()
+            if text and not candidate.startswith(("  ", "\t")):
+                break
+            if text:
+                parts.append(text)
+            idx += 1
+        description = " ".join(parts)
+    return True, description
+
+
+def _is_trigger_task(task: dict) -> bool:
+    """Return True when *task* has both trigger-rate keys, ``query`` and ``should_trigger``."""
+    return all(field in task for field in ("query", "should_trigger"))
+
+
+def _is_pattern_task(task: dict) -> bool:
+    """Return True when *task* looks like a pattern-benchmark task.
+
+    That means it has a ``prompt`` key plus at least one of
+    ``expected_patterns``, ``forbidden_patterns`` or ``weight``.
+    """
+    if "prompt" not in task:
+        return False
+    pattern_keys = ("expected_patterns", "forbidden_patterns", "weight")
+    return any(key in task for key in pattern_keys)
+
+
+def _validate_task_set(tasks: list[dict]) -> None:
+    """Fail fast, with a specific message, on task sets the loop cannot run.
+
+    Returns normally only when every task is a trigger-rate task. Raises
+    ``ValueError`` for an empty set, for a set containing both trigger-rate
+    and pattern tasks, for a set made up entirely of pattern tasks, and for
+    anything else.
+    """
+    if not tasks:
+        raise ValueError("Task file is empty.")
+
+    total = len(tasks)
+    trigger_count = len([t for t in tasks if _is_trigger_task(t)])
+    pattern_count = len([t for t in tasks if _is_pattern_task(t)])
+
+    if trigger_count > 0 and pattern_count > 0:
+        raise ValueError("Task file mixes trigger-rate and pattern benchmark formats. Use one format per run.")
+
+    if trigger_count == total:
+        return
+
+    # Not all trigger tasks: pick the message that best explains why.
+    if pattern_count == total:
+        message = (
+            "Pattern benchmark tasks are not supported by optimize_loop.py yet. "
+            "Use trigger-rate tasks with 'query' and 'should_trigger' fields."
+        )
+    else:
+        message = "Unsupported task format. Expected trigger-rate tasks with 'query' and 'should_trigger' fields."
+    raise ValueError(message)
+
+
+# ---------------------------------------------------------------------------
+# Trigger-rate evaluator (uses existing run_eval infrastructure)
+# ---------------------------------------------------------------------------
+
+
+def _run_trigger_rate(
+    target_path: Path,
+    description: str,
+    tasks: list[dict],
+    num_workers: int = 5,
+    timeout: int = 30,
+    verbose: bool = False,
+) -> dict:
+    """Run trigger-rate assessment using the skill_eval infrastructure.
+
+    The tasks are written to a temporary JSON file and the target is copied
+    byte-for-byte to ``SKILL.md`` in a temporary directory. Then
+    ``python -m scripts.skill_eval.run_eval`` is run as a subprocess from the
+    nearest directory at or above the current working directory that
+    contains ``.claude`` (the current directory itself if none does).
+
+    Args:
+        target_path: File to assess.
+        description: Passed to the runner as ``--description``.
+        tasks: Tasks with 'query' and 'should_trigger' fields.
+        num_workers: Passed to the runner as ``--num-workers``.
+        timeout: Passed to the runner as ``--timeout``. The subprocess as a
+            whole is separately capped at 600 seconds.
+        verbose: Forward ``--verbose`` and log progress to stderr.
+
+    Returns:
+        The JSON object the runner printed on stdout. If the runner exits
+        non-zero, times out, or prints invalid JSON, an empty result with all
+        summary counts at zero is returned instead of raising.
+    """
+    import os
+
+    def _no_results() -> dict:
+        # A fresh dict each time so callers can never share mutable state.
+        return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}}
+
+    task_file = None
+    try:
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+            # Record the name before writing so the finally block still
+            # removes the file if serialization fails.
+            task_file = f.name
+            json.dump(tasks, f)
+
+        with tempfile.TemporaryDirectory() as skill_dir:
+            # A byte copy sidesteps any dependence on the locale encoding.
+            (Path(skill_dir) / "SKILL.md").write_bytes(target_path.read_bytes())
+
+            project_root = Path.cwd()
+            for parent in [project_root, *project_root.parents]:
+                if (parent / ".claude").is_dir():
+                    project_root = parent
+                    break
+
+            cmd = [
+                sys.executable, "-m", "scripts.skill_eval.run_eval",
+                "--eval-set", task_file,
+                "--skill-path", skill_dir,
+                "--description", description,
+                "--num-workers", str(num_workers),
+                "--timeout", str(timeout),
+                "--runs-per-query", "1",
+            ]
+            if verbose:
+                cmd.append("--verbose")
+                print(f"Running trigger assessment: {len(tasks)} queries", file=sys.stderr)
+
+            # NOTE(review): CLAUDECODE is removed from the child environment,
+            # presumably so the runner does not see itself as nested inside a
+            # Claude Code session; confirm against run_eval.
+            env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+
+            try:
+                result = subprocess.run(
+                    cmd, capture_output=True, text=True,
+                    cwd=str(project_root), env=env, timeout=600,
+                )
+            except subprocess.TimeoutExpired:
+                print("Trigger assessment timed out after 600s", file=sys.stderr)
+                return _no_results()
+
+            if result.returncode != 0:
+                if verbose:
+                    print(f"Trigger assessment failed: {result.stderr[:300]}", file=sys.stderr)
+                return _no_results()
+
+            try:
+                return json.loads(result.stdout)
+            except json.JSONDecodeError as exc:
+                print(f"Trigger assessment produced invalid JSON: {exc}", file=sys.stderr)
+                return _no_results()
+    finally:
+        if task_file:
+            Path(task_file).unlink(missing_ok=True)
+
+
+# ---------------------------------------------------------------------------
+# Evaluation bridge
+# ---------------------------------------------------------------------------
+
+
+def assess_target(
+    target_path: Path,
+    tasks: list[dict],
+    goal: str,
+    verbose: bool = False,
+    dry_run: bool = False,
+) -> dict:
+    """Score a target file against a task list.
+
+    The target's frontmatter is parsed first; without valid frontmatter and a
+    non-empty description the ``parses`` gate is set to False and the scores
+    are returned immediately. After that one of three paths runs:
+
+    - Dry-run (``dry_run=True``): synthetic scores derived from a SHA-256
+      hash of the file content, for exercising the loop mechanics.
+    - Trigger-rate: every task has 'query' + 'should_trigger'; the tasks are
+      handed to ``_run_trigger_rate`` and the pass ratio drives every
+      quality dimension.
+    - Anything else raises ``NotImplementedError`` (benchmark mode, NYI).
+
+    ``goal`` is accepted but not read by this function.
+
+    Returns:
+        Scores dict holding the four hard-gate booleans, the five quality
+        dimensions and a ``task_results`` list.
+    """
+    # Key order is gates, then dimensions, then task_results.
+    scores: dict = {gate: True for gate in ("parses", "compiles", "tests_pass", "protected_intact")}
+    for dimension in ("correctness", "error_handling", "language_idioms", "testing", "efficiency"):
+        scores[dimension] = 0.0
+    scores["task_results"] = []
+
+    content = target_path.read_text()
+    frontmatter_ok, description = _parse_frontmatter(content)
+    if not (frontmatter_ok and description):
+        scores["parses"] = False
+        return scores
+
+    if dry_run:
+        # Hard gates stay True here; only the quality numbers vary, and they
+        # vary deterministically with the content so that different variants
+        # score differently.
+        import hashlib
+
+        digest_prefix = hashlib.sha256(content.encode()).hexdigest()[:8]
+        base = (int(digest_prefix, 16) % 30 + 70) / 100.0  # 0.70 to 0.99
+        dry_scales = (
+            ("correctness", 10),
+            ("error_handling", 8),
+            ("language_idioms", 7),
+            ("testing", 7),
+            ("efficiency", 6),
+        )
+        for dimension, scale in dry_scales:
+            scores[dimension] = round(base * scale, 2)
+        scores["tests_pass"] = True
+        scores["task_results"].extend(
+            {
+                "name": task.get("name", task.get("query", "unnamed"))[:40],
+                "passed": True,
+                "score": base,
+                "details": "dry-run",
+            }
+            for task in tasks
+        )
+        return scores
+
+    # Anything that is not purely trigger-rate would need the behavioral
+    # benchmark path, which does not exist yet.
+    if not all(_is_trigger_task(task) for task in tasks):
+        raise NotImplementedError(
+            "Pattern benchmark tasks are not yet implemented. "
+            "Use trigger-rate tasks with 'query' and 'should_trigger' fields. "
+            "See optimization-guide.md."
+        )
+
+    results = _run_trigger_rate(target_path, description, tasks, verbose=verbose)
+    summary = results.get("summary", {})
+    total = summary.get("total", 0)
+    passed = summary.get("passed", 0)
+    if total == 0:
+        return scores
+
+    accuracy = passed / total
+    scores["correctness"] = round(accuracy * 10, 2)
+    scores["error_handling"] = round(accuracy * 8, 2)
+    scores["language_idioms"] = round(accuracy * 7, 2)
+    scores["testing"] = round(accuracy * 8, 2)
+    scores["efficiency"] = round(min(1.0, accuracy + 0.1) * 6, 2)
+    # NOTE(review): tests_pass is a hard gate, so composite_score() yields
+    # 0.0 unless every query passes; confirm this is the intended behavior.
+    scores["tests_pass"] = passed == total
+
+    for entry in results.get("results", []):
+        hit = entry.get("pass", False)
+        rate = entry.get("trigger_rate", 0)
+        scores["task_results"].append({
+            "name": entry.get("query", "unnamed")[:40],
+            "passed": hit,
+            "score": 1.0 if hit else 0.0,
+            "details": f"trigger_rate={rate:.2f}",
+        })
+    return scores
+
+
+# ---------------------------------------------------------------------------
+# Protected section validation
+# ---------------------------------------------------------------------------
+
+# Matches one protected region, both marker comments included. DOTALL lets
+# the region span several lines; the lazy quantifier stops at the first end
+# marker.
+_PROTECTED_RE = re.compile(
+    r"<!--\s*DO NOT OPTIMIZE\s*-->(.*?)<!--\s*END DO NOT OPTIMIZE\s*-->",
+    re.DOTALL,
+)
+
+
+def check_protected_sections(original: str, variant: str) -> bool:
+    """Return True when the variant keeps every DO NOT OPTIMIZE region verbatim.
+
+    The regions found in each text (markers included) are compared as ordered
+    lists, so a missing, added, reordered or edited region yields False.
+    """
+    protected_before = [m.group(0) for m in _PROTECTED_RE.finditer(original)]
+    protected_after = [m.group(0) for m in _PROTECTED_RE.finditer(variant)]
+    return protected_before == protected_after
+
+
+# ---------------------------------------------------------------------------
+# Main loop
+# ---------------------------------------------------------------------------
+
+
+def run_optimization_loop(
+    target_path: Path,
+    goal: str,
+    benchmark_tasks_path: Path,
+    max_iterations: int = 20,
+    min_gain: float = 0.02,
+    train_split: float = 0.6,
+    model: str = "claude-sonnet-4-20250514",
+    verbose: bool = False,
+    report_path: Path | None = None,
+    output_dir: Path | None = None,
+    dry_run: bool = False,
+) -> dict:
+    """Run the autoresearch optimization loop."""
+    if output_dir is None:
+        output_dir = Path("evals/iterations")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    all_tasks = load_benchmark_tasks(benchmark_tasks_path)
+    _validate_task_set(all_tasks)
+    train_tasks, test_tasks = split_tasks(all_tasks, train_split)
+
+    if verbose:
+        print(f"Tasks: {len(train_tasks)} train, {len(test_tasks)} test", file=sys.stderr)
+
+    original_content = target_path.read_text()
+    target_valid, target_description = _parse_frontmatter(original_content)
+    if not target_valid or not target_description:
+        raise ValueError(
+            "Target must have YAML frontmatter with a non-empty description. "
+            "optimize_loop.py currently supports frontmatter-description optimization only."
+        )
+    current_content = original_content
+    target_label = target_path.name
+
+    if verbose:
+        print("Running baseline evaluation...", file=sys.stderr)
+
+    baseline_scores = assess_target(target_path, train_tasks, goal, verbose, dry_run)
+    baseline_composite = composite_score(baseline_scores)
+    best_score = baseline_composite
+    best_content = current_content
+    best_iteration = 0
+
+    baseline_holdout_scores = assess_target(target_path, test_tasks, goal, verbose, dry_run) if test_tasks else None
+    baseline_holdout = composite_score(baseline_holdout_scores) if baseline_holdout_scores else None
+
+    if verbose:
+        holdout_display = f"{baseline_holdout:.4f}" if baseline_holdout is not None else "n/a"
+        print(f"Baseline: train={baseline_composite:.4f}, holdout={holdout_display}", file=sys.stderr)
+
+    iterations: list[dict] = []
+    consecutive_reverts = 0
+    exit_reason = "unknown"
+    status = "RUNNING"
+    total_tokens = 0
+
+    for i in range(1, max_iterations + 1):
+        if verbose:
+            print(f"\n{'=' * 60}\nIteration {i}/{max_iterations} (best={best_score:.4f})", file=sys.stderr)
+
+        # 1. Generate variant
+        t0 = time.time()
+        last_failures = []
+        if iterations:
+            last_scores_data = iterations[-1].get("scores", {})
+            last_failures = [t for t in last_scores_data.get("task_results", []) if not t.get("passed")]
+        history = [
+            {
+                "number": item["number"],
+                "verdict": item["verdict"],
+                "change_summary": item["change_summary"],
+                "delta": item["delta"],
+            }
+            for item in iterations[-5:]
+        ]
+
+        if dry_run:
+            variant_content, change_summary, reasoning = make_dry_run_variant(current_content, i)
+            variant_output = {
+                "variant": variant_content,
+                "summary": change_summary,
+                "reasoning": reasoning,
+                "tokens_used": 0,
+                "deletions": [],
+                "deletion_justification": "",
+            }
+            deletions = []
+            deletion_justification = ""
+        else:
+            with tempfile.NamedTemporaryFile(mode="w", suffix=target_path.suffix, encoding="utf-8") as current_file:
+                current_file.write(current_content)
+                current_file.flush()
+                variant_result = subprocess.run(
+                    [
+                        sys.executable,
+                        str(Path(__file__).parent / "generate_variant.py"),
+                        "--target", str(target_path),
+                        "--goal", goal,
+                        "--current-content-file", current_file.name,
+                        "--failures", json.dumps(last_failures),
+                        "--history", json.dumps(history),
+                        "--model", model,
+                    ],
+                    capture_output=True, text=True, timeout=120,
+                )
+
+            if variant_result.returncode != 0:
+                if verbose:
+                    print(f"Variant generation failed: {variant_result.stderr}", file=sys.stderr)
+                consecutive_reverts += 1
+                iterations.append({
+                    "number": i, "verdict": "REVERT",
+                    "score": {"train": best_score},
+                    "delta": "0", "change_summary": "Variant generation failed",
+                    "reasoning": variant_result.stderr[:200], "diff": "",
+                })
+                if consecutive_reverts >= 5:
+                    exit_reason = f"converged (5 consecutive reverts at iteration {i})"
+                    status = "CONVERGED"
+                    break
+                continue
+
+            try:
+                variant_output = json.loads(variant_result.stdout)
+                variant_content = variant_output["variant"]
+                change_summary = variant_output.get("summary", "")
+                reasoning = variant_output.get("reasoning", "")
+                total_tokens += variant_output.get("tokens_used", 0)
+                deletions = variant_output.get("deletions", [])
+                deletion_justification = variant_output.get("deletion_justification", "").strip()
+            except (json.JSONDecodeError, KeyError) as e:
+                if verbose:
+                    print(f"Parse error: {e}", file=sys.stderr)
+                consecutive_reverts += 1
+                iterations.append({
+                    "number": i, "verdict": "REVERT",
+                    "score": {"train": best_score},
+                    "delta": "0", "change_summary": f"Parse error: {e}",
+                    "reasoning": "", "diff": "",
+                })
+                if consecutive_reverts >= 5:
+                    exit_reason = f"converged (5 consecutive reverts at iteration {i})"
+                    status = "CONVERGED"
+                    break
+                continue
+
+        gen_elapsed = time.time() - t0
+
+        # 2. Validate protected sections
+        if not check_protected_sections(original_content, variant_content):
+            if verbose:
+                print("REJECTED: Protected sections modified", file=sys.stderr)
+            diff_text = generate_diff(current_content, variant_content, target_label)
+            save_iteration(output_dir, i, variant_content, {"protected_intact": False},
+                           "REVERT", "Protected sections modified", diff_text, change_summary)
+            iterations.append({
+                "number": i, "verdict": "REVERT",
+                "score": {"train": 0.0},
+                "delta": "0", "change_summary": "Protected sections modified",
+                "reasoning": reasoning, "diff": diff_text,
+            })
+            consecutive_reverts += 1
+            if consecutive_reverts >= 5:
+                exit_reason = f"converged (5 consecutive reverts at iteration {i})"
+                status = "CONVERGED"
+                break
+            continue
+
+        if deletions and not deletion_justification:
+            if verbose:
+                print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr)
+            diff_text = generate_diff(current_content, variant_content, target_label)
+            save_iteration(
+                output_dir,
+                i,
+                variant_content,
+                {"protected_intact": True},
+                "REVERT",
+                "Deleted sections without justification",
+                diff_text,
+                change_summary,
+                deletions=deletions,
+            )
+            iterations.append({
+                "number": i,
+                "verdict": "REVERT",
+                "score": {"train": best_score},
+                "delta": "0",
+                "change_summary": "Deleted sections without justification",
+                "reasoning": reasoning,
+                "diff": diff_text,
+                "deletions": deletions,
+                "deletion_justification": "",
+            })
+            consecutive_reverts += 1
+            if consecutive_reverts >= 5:
+                exit_reason = f"converged (5 consecutive reverts at iteration {i})"
+                status = "CONVERGED"
+                break
+            continue
+
+        # 3. Evaluate variant
+        temp_target = target_path.parent / f".{target_path.stem}_variant{target_path.suffix}"
+        temp_target.write_text(variant_content)
+        try:
+            t0 = time.time()
+            variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run)
+            eval_elapsed = time.time() - t0
+            variant_composite = composite_score(variant_scores)
+        finally:
+            temp_target.unlink(missing_ok=True)
+
+        diff_text = generate_diff(current_content, variant_content, target_label)
+
+        if verbose:
+            print(f"Score: {variant_composite:.4f} (gain={variant_composite - best_score:.4f}, gen={gen_elapsed:.1f}s, eval={eval_elapsed:.1f}s)", file=sys.stderr)
+
+        # 4. Keep/revert (deterministic arithmetic)
+        gain = variant_composite - best_score
+        if gain > min_gain:
+            verdict = "KEEP"
+            best_score = variant_composite
+            best_content = variant_content
+            best_iteration = i
+            current_content = variant_content
+            consecutive_reverts = 0
+            delta_str = f"+{gain:.2f}"
+        else:
+            verdict = "REVERT"
+            consecutive_reverts += 1
+            delta_str = f"{gain:+.2f}" if gain != 0 else "0"
+
+        if deletions and deletion_justification:
+            change_summary = f"{change_summary} [deletion justified]"
+
+        save_iteration(output_dir, i, variant_content, variant_scores,
+                       verdict, reasoning, diff_text, change_summary,
+                       deletions=deletions, deletion_justification=deletion_justification)
+
+        iteration_data: dict = {
+            "number": i, "verdict": verdict,
+            "score": {"train": variant_composite, "test": None},
+            "delta": delta_str, "change_summary": change_summary,
+            "reasoning": reasoning, "diff": diff_text,
+            "tokens_used": variant_output.get("tokens_used", 0),
+            "scores": variant_scores,
+            "deletions": deletions,
+            "deletion_justification": deletion_justification,
+        }
+
+        # 5. Goodhart alarm — every 5 iterations, check held-out set
+        if test_tasks and i % 5 == 0:
+            try:
+                temp_target.write_text(best_content)
+                holdout_scores = assess_target(temp_target, test_tasks, goal, verbose, dry_run)
+                holdout_composite = composite_score(holdout_scores)
+                iteration_data["score"]["test"] = holdout_composite
+            finally:
+                temp_target.unlink(missing_ok=True)
+
+            if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite):
+                if verbose:
+                    print(f"GOODHART ALARM: holdout={holdout_composite:.4f} vs baseline={baseline_holdout:.4f}", file=sys.stderr)
+                exit_reason = f"goodhart_alarm (iteration {i})"
+                status = "GOODHART_ALARM"
+                iterations.append(iteration_data)
+                break
+
+        iterations.append(iteration_data)
+
+        # 6. Convergence check
+        if consecutive_reverts >= 5:
+            exit_reason = f"converged (5 consecutive reverts at iteration {i})"
+            status = "CONVERGED"
+            break
+
+        # Regenerate live report
+        if report_path:
+            rd = _build_report_data(target_label, goal, baseline_composite, baseline_holdout,
+                                    len(train_tasks), len(test_tasks), iterations, max_iterations,
+                                    status, total_tokens)
+            report_path.write_text(generate_optimization_report(rd, auto_refresh=True))
+
+    else:
+        exit_reason = f"max_iterations ({max_iterations})"
+        status = "COMPLETE"
+
+    # Final report
+    if report_path:
+        rd = _build_report_data(target_label, goal, baseline_composite, baseline_holdout,
+                                len(train_tasks), len(test_tasks), iterations, max_iterations,
+                                status, total_tokens)
+        report_path.write_text(generate_optimization_report(rd, auto_refresh=False))
+
+    if best_iteration > 0:
+        best_path = output_dir / "best_variant.md"
+        best_path.write_text(best_content)
+        if verbose:
+            print(f"\nBest variant saved to: {best_path}", file=sys.stderr)
+
+    result = {
+        "exit_reason": exit_reason, "status": status,
+        "target": str(target_path), "goal": goal,
+        "baseline_score": {"train": baseline_composite, "test": baseline_holdout},
+        "baseline_train_score": baseline_composite,
+        "baseline_holdout_score": baseline_holdout,
+        "best_score": best_score,
+        "best_iteration": best_iteration, "iterations_run": len(iterations),
+        "max_iterations": max_iterations,
+        "improvements_found": sum(1 for it in iterations if it["verdict"] == "KEEP"),
+        "total_tokens": total_tokens,
+        "train_size": len(train_tasks), "test_size": len(test_tasks),
+        "iterations": iterations,
+    }
+    (output_dir / "results.json").write_text(json.dumps(result, indent=2))
+    return result
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main():
+    """Parse CLI arguments, run the optimization loop, and print the result JSON to stdout."""
+    parser = argparse.ArgumentParser(description="Autoresearch optimization loop for agent/skill files")
+    parser.add_argument("--target", required=True, help="Path to agent/skill file to optimize")
+    parser.add_argument("--goal", required=True, help="Optimization objective")
+    parser.add_argument("--benchmark-tasks", required=True, help="Path to benchmark tasks JSON")
+    parser.add_argument("--max-iterations", type=int, default=20, help="Max iterations (default: 20)")
+    parser.add_argument("--min-gain", type=float, default=0.02, help="Min score gain to keep (default: 0.02)")
+    parser.add_argument("--train-split", type=float, default=0.6, help="Train fraction (default: 0.6)")
+    parser.add_argument("--model", required=True, help="Model for variant generation")
+    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
+    parser.add_argument("--dry-run", action="store_true", help="Use synthetic scores (test loop mechanics without API)")
+    parser.add_argument("--report", default=None, help="Path for live HTML report")
+    parser.add_argument("--output-dir", default=None, help="Directory for iteration snapshots")
+    args = parser.parse_args()
+
+    # is_file(), not exists(): a directory would pass exists() and then crash inside read_text().
+    target = Path(args.target)
+    if not target.is_file():
+        print(f"Error: Target not found or not a file: {target}", file=sys.stderr)
+        sys.exit(1)
+
+    tasks_path = Path(args.benchmark_tasks)
+    if not tasks_path.is_file():
+        print(f"Error: Tasks not found or not a file: {tasks_path}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        result = run_optimization_loop(
+            target_path=target, goal=args.goal, benchmark_tasks_path=tasks_path,
+            max_iterations=args.max_iterations, min_gain=args.min_gain,
+            train_split=args.train_split, model=args.model,
+            verbose=args.verbose, dry_run=args.dry_run,
+            report_path=Path(args.report) if args.report else None,
+            output_dir=Path(args.output_dir) if args.output_dir else None,
+        )
+    except ValueError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    print(json.dumps(result, indent=2))
+    if args.verbose:
+        print(f"\nExit: {result['exit_reason']}", file=sys.stderr)
+        print(f"Best: {result['best_score']:.4f} (iteration {result['best_iteration']})", file=sys.stderr)
+        print(f"Improvements: {result['improvements_found']}/{result['iterations_run']}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/skill-creator/assets/eval_viewer.html b/skills/skill-creator/assets/eval_viewer.html
index 636532b..48822b6 100644
--- a/skills/skill-creator/assets/eval_viewer.html
+++ b/skills/skill-creator/assets/eval_viewer.html
@@ -641,6 +641,98 @@
 @media (prefers-reduced-motion: reduce) {
   *, *::before, *::after { transition: none !important; animation: none !important; }
 }
+
+/* ─── Iterations Tab: dashboard, convergence chart, iteration table, actions ─── */
+.opt-dashboard {
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: var(--radius-lg);
+  padding: 16px 20px;
+  margin-bottom: 20px;
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
+  gap: 12px;
+}
+
+.opt-dash-item {
+  display: flex;
+  flex-direction: column;
+  gap: 2px;
+}
+
+.opt-dash-label {
+  font-size: 11px;
+  color: var(--muted);
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+}
+
+.opt-dash-value {
+  font-size: 16px;
+  font-weight: 600;
+  color: var(--bright);
+  font-variant-numeric: tabular-nums;
+}
+
+.opt-running { color: var(--accent); } /* applied when status is 'RUNNING' */
+.opt-done { color: var(--green); } /* applied for every status other than RUNNING / GOODHART_ALARM */
+.opt-alarm { color: var(--red); } /* applied when status is 'GOODHART_ALARM' */
+
+.opt-chart { margin-bottom: 20px; }
+
+.iter-row { cursor: pointer; transition: background 0.1s; } /* rows toggle their diff row on click */
+.iter-row:hover { background: var(--surface-2); }
+
+.iter-diff-row td { padding: 0; } /* the nested .iter-diff-block supplies its own padding */
+
+.iter-diff-block {
+  background: var(--code-bg);
+  border: 1px solid var(--border);
+  padding: 12px 14px;
+  font-size: 11.5px;
+  font-family: var(--font-mono);
+  max-height: 400px;
+  overflow: auto;
+  white-space: pre;
+  line-height: 1.6;
+  color: #8899bb;
+}
+
+/* Class name is built in script as 'verdict-' + the lower-cased iteration verdict. */
+.verdict-keep { color: var(--green); font-weight: 600; }
+.verdict-revert { color: var(--red); font-weight: 600; }
+.verdict-stop { color: var(--yellow); font-weight: 600; }
+
+.opt-actions {
+  margin-top: 16px;
+  display: flex;
+  gap: 10px;
+  align-items: center;
+}
+
+.opt-btn {
+  padding: 8px 18px;
+  border-radius: var(--radius);
+  border: 1px solid var(--border-2);
+  background: var(--surface-2);
+  color: var(--text);
+  cursor: pointer;
+  font-size: 12px;
+  font-family: var(--font-sans);
+  font-weight: 500;
+  transition: all 0.15s;
+}
+
+.opt-btn:hover { color: var(--bright); background: var(--surface-3); }
+.opt-btn:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
+
+.opt-btn-primary {
+  background: var(--accent);
+  color: #fff;
+  border: none;
+  box-shadow: 0 1px 4px rgba(77, 142, 245, 0.3);
+}
+
+.opt-btn-primary:hover { background: #5a99f8; }
 </style>
 </head>
 <body>
@@ -656,12 +748,14 @@ <h1>Blind A/B Code Review</h1>
   <button class="nav-btn active" data-page="review">Review</button>
   <button class="nav-btn" data-page="grades">Grades</button>
   <button class="nav-btn" data-page="results">Results</button>
+  <button class="nav-btn" data-page="iterations">Iterations</button>
 </nav>
 
 <main>
   <div id="review-page" class="page active" role="tabpanel"></div>
   <div id="grades-page" class="page" role="tabpanel"></div>
   <div id="results-page" class="page" role="tabpanel"></div>
+  <div id="iterations-page" class="page" role="tabpanel"></div>
 </main>
 
 <script>
@@ -681,6 +775,8 @@ <h1>Blind A/B Code Review</h1>
     document.querySelectorAll('.page').forEach(p=>p.classList.remove('active'));
     this.classList.add('active');
     document.getElementById(this.dataset.page+'-page').classList.add('active');
+    // Re-render iterations chart when tab becomes visible (clientWidth is 0 when hidden)
+    if(this.dataset.page==='iterations' && typeof window.drawOptChart==='function') window.drawOptChart();
   });
 });
 
@@ -1181,9 +1277,321 @@ <h1>Blind A/B Code Review</h1>
   page.appendChild(keySection);
 }
 
+// Render the Iterations tab from DATA.optimization: dashboard, convergence chart, iteration table, cherry-pick export, summary.
+function buildIterationsPage(){
+  var page = document.getElementById('iterations-page');
+  page.replaceChildren(); page.style.cssText = '';
+  if(!DATA || !DATA.optimization) {
+    page.textContent = 'No optimization data. Run optimize_loop.py to populate.';
+    page.style.cssText = 'display:none;padding:28px 32px;color:var(--muted);font-size:13px';
+    // Hide the Iterations tab if no data
+    var iterBtn = document.querySelector('[data-page="iterations"]');
+    if(iterBtn) iterBtn.style.display = 'none';
+    return;
+  }
+
+  var opt = DATA.optimization;
+  var iterations = opt.iterations || [];
+  var baselineRaw = opt.baseline_score;
+  var baseline = (baselineRaw && typeof baselineRaw === 'object') ? baselineRaw : {train: baselineRaw, test: opt.baseline_holdout_score};
+  var taskCounts = opt.task_counts || {train: opt.train_size, test: opt.test_size};
+  var status = opt.status || 'UNKNOWN';
+  var bt = typeof baseline.train === 'number' ? baseline.train : 0;
+
+  // Dashboard ("Best" counts only KEEP iterations so a reverted near-miss cannot inflate it)
+  var dash = el('div','opt-dashboard');
+  var best = bt;
+  iterations.forEach(function(it){ if(it.verdict==='KEEP' && it.score && typeof it.score.train === 'number' && it.score.train > best) best = it.score.train; });
+  var kept = iterations.filter(function(it){return it.verdict==='KEEP'}).length;
+  var reverted = iterations.filter(function(it){return it.verdict==='REVERT'}).length;
+  var statusCls = status==='RUNNING'?'opt-running':status==='GOODHART_ALARM'?'opt-alarm':'opt-done';
+
+  function dashItem(label, value, cls) {
+    var item = el('div','opt-dash-item');
+    item.appendChild(el('span','opt-dash-label',label));
+    var val = el('span','opt-dash-value'+(cls?' '+cls:''),String(value));
+    item.appendChild(val);
+    dash.appendChild(item);
+  }
+
+  dashItem('Status', status, statusCls);
+  dashItem('Progress', iterations.length + '/' + (opt.max_iterations||20));
+  dashItem('Baseline', bt.toFixed(2));
+  dashItem('Best', best.toFixed(2) + ' (' + (best-bt >= 0 ? '+' : '') + (best-bt).toFixed(2) + ')');
+  dashItem('Kept', String(kept));
+  dashItem('Reverted', String(reverted));
+  if(opt.total_tokens) dashItem('Tokens', Math.round(opt.total_tokens/1000)+'k');
+
+  page.appendChild(dash);
+  if(taskCounts.train || taskCounts.test) {
+    var counts = el('p','muted','Train tasks: '+String(taskCounts.train||0)+(taskCounts.test ? ' | Held-out tasks: '+String(taskCounts.test) : ''));
+    counts.style.marginBottom = '20px';
+    page.appendChild(counts);
+  }
+
+  // Convergence chart (safe DOM construction)
+  var chartBox = el('div','card opt-chart');
+  var chartHead = el('div','card-head');
+  chartHead.appendChild(el('h3','','Convergence'));
+  chartBox.appendChild(chartHead);
+  var chartBody = el('div','card-body');
+  chartBody.id = 'opt-chart-area';
+  chartBox.appendChild(chartBody);
+  page.appendChild(chartBox);
+
+  // Draw chart using safe SVG DOM methods
+  function drawOptChart() {
+    var area = document.getElementById('opt-chart-area');
+    if(!iterations.length) { area.textContent = 'No iterations yet'; return; }
+
+    var W = Math.max(120, Math.min(area.clientWidth - 16, 800)), H = 200; // clamp: clientWidth is 0 while the tab is hidden
+    var pad = {l:40, r:20, t:10, b:30};
+    var pW = W-pad.l-pad.r, pH = H-pad.t-pad.b;
+    var pts = iterations.map(function(it){
+      return {x:it.number, train:it.score?it.score.train:0, test:it.score?it.score.test:null};
+    });
+    var xMax = Math.max.apply(null, pts.map(function(p){return p.x}));
+    if(xMax<1) xMax=1;
+    var yValues = [bt];
+    pts.forEach(function(p){
+      if(typeof p.train === 'number') yValues.push(p.train);
+      if(typeof p.test === 'number') yValues.push(p.test);
+    });
+    var yMin = Math.max(0, Math.floor(Math.min.apply(null, yValues)) - 0.5);
+    var yMax = Math.ceil(Math.max.apply(null, yValues) + 0.5);
+    if(yMax <= yMin) yMax = yMin + 1;
+    function sx(x){return pad.l+(x/xMax)*pW;}
+    function sy(y){return pad.t+pH-((y-yMin)/(yMax-yMin))*pH;}
+
+    var NS = 'http://www.w3.org/2000/svg';
+    var svg = document.createElementNS(NS,'svg');
+    svg.setAttribute('width',String(W));
+    svg.setAttribute('height',String(H));
+    svg.style.display='block';
+
+    function svgLine(x1,y1,x2,y2,stroke,w,d){
+      var l=document.createElementNS(NS,'line');
+      l.setAttribute('x1',x1);l.setAttribute('y1',y1);l.setAttribute('x2',x2);l.setAttribute('y2',y2);
+      l.setAttribute('stroke',stroke);l.setAttribute('stroke-width',w);
+      if(d)l.setAttribute('stroke-dasharray',d);svg.appendChild(l);
+    }
+    function svgCircle(cx,cy,r,fill,stroke){
+      var c=document.createElementNS(NS,'circle');
+      c.setAttribute('cx',cx);c.setAttribute('cy',cy);c.setAttribute('r',r);
+      c.setAttribute('fill',fill||'none');if(stroke)c.setAttribute('stroke',stroke);
+      svg.appendChild(c);
+    }
+    function svgText(x,y,t,fill,sz,anchor){
+      var e=document.createElementNS(NS,'text');
+      e.setAttribute('x',x);e.setAttribute('y',y);e.setAttribute('fill',fill);
+      e.setAttribute('font-size',sz);if(anchor)e.setAttribute('text-anchor',anchor);
+      e.textContent=t;svg.appendChild(e);
+    }
+    function svgPath(d,stroke,w,dash){
+      var p=document.createElementNS(NS,'path');
+      p.setAttribute('d',d);p.setAttribute('fill','none');
+      p.setAttribute('stroke',stroke);p.setAttribute('stroke-width',w);
+      if(dash)p.setAttribute('stroke-dasharray',dash);svg.appendChild(p);
+    }
+
+    // Grid
+    for(var y=yMin;y<=yMax+0.001;y+=0.5){svgLine(pad.l,sy(y),W-pad.r,sy(y),'#222832',1);svgText(pad.l-6,sy(y)+4,y.toFixed(1),'#5c6a7e',10,'end');}
+    // Baseline
+    svgLine(pad.l,sy(bt),W-pad.r,sy(bt),'#d4a830',1,'4,4');
+    // Train (solid blue)
+    var tp=pts.filter(function(p){return p.train!=null});
+    if(tp.length>1){var d=tp.map(function(p,i){return(i===0?'M':'L')+sx(p.x)+','+sy(p.train)}).join(' ');svgPath(d,'#4d8ef5',2);}
+    tp.forEach(function(p){svgCircle(sx(p.x),sy(p.train),3,'#4d8ef5');});
+    // Test (dashed green)
+    var hp=pts.filter(function(p){return p.test!=null});
+    if(hp.length>1){var d2=hp.map(function(p,i){return(i===0?'M':'L')+sx(p.x)+','+sy(p.test)}).join(' ');svgPath(d2,'#3dba6c',2,'6,3');}
+    hp.forEach(function(p){svgCircle(sx(p.x),sy(p.test),3,'none','#3dba6c');});
+    // X labels
+    for(var x=1;x<=xMax;x++){svgText(sx(x),H-5,String(x),'#5c6a7e',10,'middle');}
+    // Legend
+    svgText(pad.l+10,pad.t+14,'Train','#4d8ef5',10);
+    svgText(pad.l+50,pad.t+14,'Held-out','#3dba6c',10);
+    svgText(pad.l+110,pad.t+14,'Baseline','#d4a830',10);
+
+    area.replaceChildren(svg);
+  }
+  drawOptChart();
+  window.drawOptChart = drawOptChart;
+  window.addEventListener('resize', drawOptChart);
+
+  // Iteration table
+  var tableCard = el('div','card');
+  tableCard.style.marginBottom = '20px';
+  var tHead = el('div','card-head');
+  tHead.appendChild(el('h3','','Iterations'));
+  tableCard.appendChild(tHead);
+  var tBody = el('div','card-body');
+
+  var tbl = document.createElement('table');
+  var thead = document.createElement('thead');
+  var hrow = document.createElement('tr');
+  ['#','Verdict','Score','Delta','Change','Pick'].forEach(function(h){hrow.appendChild(el('th','',h));});
+  thead.appendChild(hrow);tbl.appendChild(thead);
+
+  var tbody = document.createElement('tbody');
+  iterations.forEach(function(it){
+    var tr = document.createElement('tr');
+    tr.className = 'iter-row';
+    tr.dataset.iteration = String(it.number);
+
+    tr.appendChild(el('td','',String(it.number)));
+
+    var vtd = document.createElement('td');
+    var vspan = el('span','verdict-'+(it.verdict||'').toLowerCase(),it.verdict||'?');
+    vtd.appendChild(vspan);
+    tr.appendChild(vtd);
+
+    var sc = it.score || {};
+    var scoreText = typeof sc.train === 'number' ? sc.train.toFixed(2) : '?';
+    if(typeof sc.test === 'number') scoreText += ' / ' + sc.test.toFixed(2);
+    tr.appendChild(el('td','',scoreText));
+
+    var delta = String(it.delta||'');
+    var dcls = delta.indexOf('+')===0&&delta!=='+0'?'d-pos':delta.indexOf('-')===0?'d-neg':'d-zero';
+    tr.appendChild(el('td',dcls,delta));
+
+    tr.appendChild(el('td','',String(it.change_summary||'').substring(0,60)));
+
+    var pickTd = document.createElement('td');
+    var pickLabel = document.createElement('label');
+    var pickCb = document.createElement('input');
+    pickCb.type = 'checkbox';
+    pickCb.className = 'cherry-pick-cb';
+    pickCb.dataset.iteration = String(it.number);
+    if(it.verdict==='KEEP') pickCb.checked = true;
+    if(it.verdict!=='KEEP') pickCb.disabled = true;
+    pickLabel.appendChild(pickCb);
+    pickLabel.appendChild(document.createTextNode(' Pick'));
+    pickTd.appendChild(pickLabel);
+    tr.appendChild(pickTd);
+
+    // Click to toggle diff
+    tr.addEventListener('click', function(e){
+      if(e.target.closest && e.target.closest('label')) return;
+      if(e.target.type==='checkbox') return;
+      var diffRow = document.getElementById('opt-diff-'+it.number);
+      if(diffRow) diffRow.classList.toggle('hidden');
+    });
+    tbody.appendChild(tr);
+
+    // Diff row (hidden by default)
+    var diffTr = document.createElement('tr');
+    diffTr.className = 'iter-diff-row hidden';
+    diffTr.id = 'opt-diff-'+it.number;
+    var diffTd = document.createElement('td');
+    diffTd.colSpan = 6;
+    var diffPre = el('pre','iter-diff-block',it.diff||'(no diff)');
+    diffTd.appendChild(diffPre);
+    diffTr.appendChild(diffTd);
+    tbody.appendChild(diffTr);
+  });
+  tbl.appendChild(tbody);
+  tBody.appendChild(tbl);
+  tableCard.appendChild(tBody);
+  page.appendChild(tableCard);
+
+  // Actions bar
+  var actions = el('div','opt-actions');
+
+  var previewBtn = el('button','opt-btn opt-btn-primary','Preview Combined');
+  previewBtn.addEventListener('click', function(){
+    var selected = Array.from(document.querySelectorAll('#iterations-page .cherry-pick-cb:checked'))
+      .map(function(cb){return parseInt(cb.dataset.iteration, 10);});
+    if(!selected.length){alert('No iterations selected');return;}
+    var combined = selected.map(function(n){
+      var it = iterations.find(function(i){return i.number===n;});
+      return '--- Iteration '+n+' ---\n'+(it&&it.diff||'(no diff)');
+    }).join('\n\n');
+    var preview = document.getElementById('opt-preview-content');
+    preview.textContent = combined;
+    document.getElementById('opt-preview-area').classList.remove('hidden');
+  });
+  actions.appendChild(previewBtn);
+
+  var exportBtn = el('button','opt-btn','Export Selected');
+  exportBtn.addEventListener('click', function(){
+    var selected = Array.from(document.querySelectorAll('#iterations-page .cherry-pick-cb:checked'))
+      .map(function(cb){return parseInt(cb.dataset.iteration, 10);});
+    if(!selected.length){alert('No iterations selected');return;}
+    var output = {selected_iterations:selected, diffs:{}};
+    selected.forEach(function(n){
+      var it = iterations.find(function(i){return i.number===n;});
+      output.diffs[String(n)] = it&&it.diff||'';
+    });
+    var blob = new Blob([JSON.stringify(output,null,2)],{type:'application/json'});
+    var url = URL.createObjectURL(blob);
+    var a = document.createElement('a');a.href=url;a.download='cherry-picked-iterations.json';a.click();
+    setTimeout(function(){URL.revokeObjectURL(url);}, 0); // defer so the download can start before the URL is released
+  });
+  actions.appendChild(exportBtn);
+
+  page.appendChild(actions);
+
+  // Preview area
+  var previewArea = el('div','hidden');
+  previewArea.id = 'opt-preview-area';
+  previewArea.style.marginTop = '16px';
+  previewArea.appendChild(el('h3','','Combined Preview'));
+  previewArea.lastChild.style.cssText = 'color:var(--bright);margin-bottom:8px;font-size:14px';
+  var previewPre = el('pre','iter-diff-block','');
+  previewPre.id = 'opt-preview-content';
+  previewArea.appendChild(previewPre);
+  page.appendChild(previewArea);
+
+  // Summary stats card
+  if(opt.summary || iterations.length) {
+    var statsCard = el('div','card');
+    statsCard.style.marginTop = '20px';
+    var statsHead = el('div','card-head');
+    statsHead.appendChild(el('h3','','Summary'));
+    statsCard.appendChild(statsHead);
+    var statsBody = el('div','card-body');
+
+    var totalIter = iterations.length;
+    var improvRate = totalIter>0 ? (kept/totalIter*100).toFixed(1)+'%' : '0%';
+    var totalImprove = best - bt;
+    var avgPerKeep = kept>0 ? (totalImprove/kept).toFixed(2) : '0';
+    var tokensK = opt.total_tokens ? Math.round(opt.total_tokens/1000)+'k' : '?';
+    var tokPerImprov = kept>0&&opt.total_tokens ? Math.round(opt.total_tokens/kept/1000)+'k' : '?';
+    var stopReason = opt.exit_reason || opt.status || '?';
+
+    buildStat(statsBody, 'Total Iterations', String(totalIter));
+    buildStat(statsBody, 'Improvements Found', kept+' ('+improvRate+')');
+    buildStat(statsBody, 'Total Score Improvement', (totalImprove >= 0 ? '+' : '')+totalImprove.toFixed(2)+' ('+bt.toFixed(2)+'→'+best.toFixed(2)+')');
+    buildStat(statsBody, 'Avg Improvement per KEEP', avgPerKeep);
+    buildStat(statsBody, 'Total Tokens', tokensK);
+    buildStat(statsBody, 'Tokens per Improvement', tokPerImprov);
+    buildStat(statsBody, 'Stop Reason', stopReason);
+
+    statsCard.appendChild(statsBody);
+    page.appendChild(statsCard);
+  }
+
+  // Goal card
+  if(opt.goal) {
+    var goalCard = el('div','card');
+    goalCard.style.marginTop = '16px';
+    var goalHead = el('div','card-head');
+    goalHead.appendChild(el('h3','','Optimization Target'));
+    goalCard.appendChild(goalHead);
+    var goalBody = el('div','card-body');
+    buildStat(goalBody, 'Target', opt.target||'?');
+    buildStat(goalBody, 'Goal', opt.goal);
+    buildStat(goalBody, 'Train/Test Split', String(taskCounts.train||opt.train_size||'?')+'/'+String(taskCounts.test||opt.test_size||'?'));
+    goalCard.appendChild(goalBody);
+    page.appendChild(goalCard);
+  }
+}
+
 buildReviewPage();
 buildGradesPage();
 buildResultsPage();
+buildIterationsPage();
 </script>
 </body>
 </html>
diff --git a/skills/skill-creator/scripts/eval_compare.py b/skills/skill-creator/scripts/eval_compare.py
index 58f1849..552100b 100644
--- a/skills/skill-creator/scripts/eval_compare.py
+++ b/skills/skill-creator/scripts/eval_compare.py
@@ -162,6 +162,24 @@ def find_iteration_dirs(workspace: Path) -> list[Path]:
     return [d for d in dirs if d.is_dir()]
 
 
+def load_optimization_data(workspace: Path) -> dict | None:
+    """Return the first optimize-loop results.json payload found under *workspace*, else None."""
+    required_keys = ("iterations", "baseline_score", "target")
+    search_order = (
+        workspace / "results.json",
+        workspace / "evals" / "iterations" / "results.json",
+        workspace / "out" / "results.json",
+    )
+    for candidate in search_order:
+        if not candidate.exists():
+            continue
+        payload = load_json_safe(candidate)
+        # Only accept dict payloads that carry every optimize-loop result key.
+        if isinstance(payload, dict) and all(key in payload for key in required_keys):
+            return payload
+    return None
+
+
 def build_data(workspace: Path) -> dict:
     """Build full comparison data."""
     evals_path = workspace / "evals" / "evals.json"
@@ -185,6 +203,7 @@ def build_data(workspace: Path) -> dict:
             "variantAName": "Variant A",
             "variantBName": "Variant B",
             "variantCName": "Variant C",
+            "optimization": load_optimization_data(workspace),
         }
 
     iteration = iterations[-1]  # Latest iteration
@@ -239,6 +258,7 @@ def build_data(workspace: Path) -> dict:
         "variantAName": variants.get("A", {}).get("name", "Variant A"),
         "variantBName": variants.get("B", {}).get("name", "Variant B"),
         "variantCName": variants.get("C", {}).get("name", "Variant C"),
+        "optimization": load_optimization_data(workspace),
     }