EveryInc · huntharo · Mar 29, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026
diff --git a/docs/brainstorms/2026-03-29-iterative-optimization-loop-requirements.md b/docs/brainstorms/2026-03-29-iterative-optimization-loop-requirements.md
diff --git a/docs/plans/2026-03-29-001-feat-iterative-optimization-loop-skill-beta-plan.md b/docs/plans/2026-03-29-001-feat-iterative-optimization-loop-skill-beta-plan.md
diff --git a/plugins/compound-engineering/README.md b/plugins/compound-engineering/README.md
@@ -7,7 +7,8 @@ AI-powered development tools that get smarter with every use. Make each unit of
 | Component | Count |
 |-----------|-------|
 | Agents | 35+ |
-| Skills | 40+ |
+| Skills | 41+ |
+| MCP Servers | 1 |
 
 ## Skills
 
@@ -24,6 +25,9 @@ The primary entry points for engineering work, invoked as slash commands:
 | `/ce:work` | Execute work items systematically |
 | `/ce:compound` | Document solved problems to compound team knowledge |
 | `/ce:compound-refresh` | Refresh stale or drifting learnings and decide whether to keep, update, replace, or archive them |
+| `/ce:optimize` | Run iterative optimization loops with parallel experiments, measurement gates, and LLM-as-judge quality scoring |
+
+For `/ce:optimize`, start from the checked-in example specs under `skills/ce-optimize/references/` and keep the first run serial, small, and cheap until the measurement harness is trustworthy.
 
 ### Git Workflow
 

diff --git a/plugins/compound-engineering/skills/ce-optimize/SKILL.md b/plugins/compound-engineering/skills/ce-optimize/SKILL.md
diff --git a/plugins/compound-engineering/skills/ce-optimize/references/example-hard-spec.yaml b/plugins/compound-engineering/skills/ce-optimize/references/example-hard-spec.yaml
@@ -0,0 +1,64 @@
+# Minimal first-run template for objective metrics.
+# Start here when "better" is a scalar value from the measurement harness.
+
+name: improve-build-latency
+description: Reduce build latency without regressing correctness
+
+metric:
+  primary:
+    type: hard  # presumably the "objective metric" kind named in the header — confirm against the schema
+    name: build_seconds
+    direction: minimize  # lower build_seconds is better
+  degenerate_gates:  # presumably an experiment failing any check below is rejected — confirm
+    - name: build_passed
+      check: "== 1"
+      description: The build must stay green
+    - name: test_pass_rate
+      check: ">= 1.0"
+      description: Required tests must keep passing
+  diagnostics:  # listed without checks; presumably recorded but not gated — confirm
+    - name: artifact_size_mb
+    - name: peak_memory_mb
+
+measurement:
+  command: "python evaluate.py"  # the script listed under scope.immutable below
+  timeout_seconds: 300
+  working_directory: "tools/eval"  # with the command above this points at tools/eval/evaluate.py
+  stability:
+    mode: repeat
+    repeat_count: 3
+    aggregation: median  # presumably the median over the 3 repeats is the reported value — confirm
+    noise_threshold: 0.05  # NOTE(review): unit not shown here (fraction vs absolute seconds) — confirm
+
+scope:
+  mutable:  # paths experiments may change; enforcement is not shown in this file
+    - "src/build/"
+    - "config/build.yaml"
+  immutable:
+    - "tools/eval/evaluate.py"  # the measurement script itself is kept out of the editable set
+    - "tests/fixtures/"
+    - "scripts/ci/"
+
+execution:
+  mode: serial  # plugin README advises keeping the first run serial, small, and cheap
+  backend: worktree
+  max_concurrent: 1
+
+parallel:
+  port_strategy: none
+  shared_files: []
+
+dependencies:
+  approved: []  # nothing is pre-approved in this template
+
+constraints:
+  - "Keep output artifacts backward compatible"
+  - "Do not skip required validation steps"
+
+stopping:
+  max_iterations: 4  # small budget, in keeping with a cheap first run
+  max_hours: 1
+  plateau_iterations: 3
+  target_reached: true  # NOTE(review): no target value appears in this file — confirm where it is defined
+
+max_runner_up_merges_per_batch: 0  # zero here; presumably disables runner-up merging — confirm
diff --git a/plugins/compound-engineering/skills/ce-optimize/references/example-judge-spec.yaml b/plugins/compound-engineering/skills/ce-optimize/references/example-judge-spec.yaml
@@ -0,0 +1,78 @@
+# Minimal first-run template for qualitative metrics.
+# Start here when true quality requires semantic judgment, not a proxy metric.
+
+name: improve-search-relevance
+description: Improve semantic relevance of search results without obvious failures
+
+metric:
+  primary:
+    type: judge  # presumably scored via the metric.judge block below — confirm against the schema
+    name: mean_score  # same name as judge.scoring.primary below
+    direction: maximize  # the rubric below rates 5 as best, so higher is better
+  degenerate_gates:  # presumably an experiment failing any check below is rejected — confirm
+    - name: result_count
+      check: ">= 5"
+      description: Return enough results to judge quality
+    - name: empty_query_failures
+      check: "== 0"
+      description: Empty or trivial queries must not fail
+  diagnostics:  # listed without checks; presumably recorded but not gated — confirm
+    - name: latency_ms
+    - name: recall_at_10
+  judge:
+    rubric: |
+      Rate each result set from 1-5 for relevance:
+      - 5: Results are directly relevant and well ordered
+      - 4: Mostly relevant with minor ordering issues
+      - 3: Mixed relevance or one obvious miss
+      - 2: Weak relevance, several misses, or poor ordering
+      - 1: Mostly irrelevant
+      Also report: ambiguous (boolean)
+    scoring:
+      primary: mean_score
+      secondary:
+        - ambiguous_rate  # presumably derived from the rubric's "ambiguous (boolean)" report — confirm
+    model: haiku
+    sample_size: 10
+    batch_size: 5
+    sample_seed: 42  # fixed seed; presumably keeps the judged sample reproducible — confirm
+    minimum_improvement: 0.2  # NOTE(review): presumably in points on the 1-5 rubric scale — confirm
+    max_total_cost_usd: 5  # presumably a spend cap in USD; enforcement is not shown in this file
+
+measurement:
+  command: "python eval_search.py"  # the script listed under scope.immutable below
+  timeout_seconds: 300
+  working_directory: "tools/eval"  # with the command above this points at tools/eval/eval_search.py
+
+scope:
+  mutable:  # paths experiments may change; enforcement is not shown in this file
+    - "src/search/"
+    - "config/search.yaml"
+  immutable:
+    - "tools/eval/eval_search.py"  # the measurement script itself is kept out of the editable set
+    - "tests/fixtures/"
+    - "docs/"
+
+execution:
+  mode: serial  # plugin README advises keeping the first run serial, small, and cheap
+  backend: worktree
+  max_concurrent: 1
+
+parallel:
+  port_strategy: none
+  shared_files: []
+
+dependencies:
+  approved: []  # empty, matching the "no new dependencies on the first run" constraint below
+
+constraints:
+  - "Preserve the existing search response shape"
+  - "Do not add new dependencies on the first run"
+
+stopping:
+  max_iterations: 4  # small budget, in keeping with a cheap first run
+  max_hours: 1
+  plateau_iterations: 3
+  target_reached: true  # NOTE(review): no target value appears in this file — confirm where it is defined
+
+max_runner_up_merges_per_batch: 0  # zero here; presumably disables runner-up merging — confirm