Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions deliberate_eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""deliberate-eval — measure whether AI planning tools improve agent outcomes."""

import json
from dataclasses import dataclass, field, asdict
from enum import Enum
from pathlib import Path
from typing import Optional


class Treatment(Enum):
    """Planning treatment levels applied to a task before the agent runs."""

    CLASS_A = "class_a"  # no planning at all — the agent just does it
    CLASS_B = "class_b"  # lightweight brief with a checklist
    CLASS_C = "class_c"  # full campaign: spec, then plan, then tasks


class AgentType(Enum):
    """The agent CLIs this harness knows how to drive."""

    CLAUDE = "claude"
    CODEX = "codex"


@dataclass
class Task:
    """A coding task to evaluate.

    Identified by ``id``; ``test_command`` is the success oracle run after
    the agent finishes.
    """

    id: str
    description: str
    repo: str  # owner/repo or local path
    test_command: str  # command to verify success
    difficulty: str = "medium"  # trivial, medium, hard
    issue_url: str = ""
    repo_ref: str = "main"  # git ref to check out
    setup_command: str = ""  # optional setup before agent runs

    def to_dict(self) -> dict:
        """Serialize to a plain ``{field name: value}`` dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "Task":
        """Build a Task from a dict, silently dropping unknown keys."""
        known = cls.__dataclass_fields__
        return cls(**{key: val for key, val in d.items() if key in known})


@dataclass
class Trajectory:
    """Captured metrics from a single agent run."""

    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0
    total_cost_usd: float = 0.0
    duration_ms: int = 0
    num_turns: int = 0
    files_changed: list[str] = field(default_factory=list)
    lines_added: int = 0
    lines_removed: int = 0
    passed: bool = False
    test_output: str = ""
    error: str = ""

    @property
    def total_tokens(self) -> int:
        """Fresh input plus output tokens (cache reads excluded)."""
        return self.input_tokens + self.output_tokens

    def to_dict(self) -> dict:
        """Serialize to a dict, with the derived total included."""
        return {**asdict(self), "total_tokens": self.total_tokens}


@dataclass
class Run:
    """A single eval run: one task × one treatment × one seed."""

    task_id: str
    treatment: str
    agent: str
    seed: int = 0
    trajectory: Trajectory = field(default_factory=Trajectory)
    status: str = "pending"  # one of: pending, running, completed, failed

    def to_dict(self) -> dict:
        """Serialize to a dict; nested trajectory gains its derived total."""
        out = asdict(self)
        out["trajectory"]["total_tokens"] = self.trajectory.total_tokens
        return out

    def to_jsonl(self) -> str:
        """Render one JSONL record (no trailing newline)."""
        return json.dumps(self.to_dict())

    @classmethod
    def from_dict(cls, d: dict) -> "Run":
        """Build a Run from a dict, dropping unknown keys at both levels."""
        payload = dict(d)  # copy so the caller's dict is untouched
        raw_traj = payload.pop("trajectory", {})
        traj = Trajectory(
            **{k: v for k, v in raw_traj.items() if k in Trajectory.__dataclass_fields__}
        )
        known = cls.__dataclass_fields__
        return cls(trajectory=traj, **{k: v for k, v in payload.items() if k in known})
130 changes: 130 additions & 0 deletions deliberate_eval/agents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""Agent adapters for headless execution."""

import json
import subprocess
import time
from pathlib import Path
from typing import Optional

from deliberate_eval import Trajectory, AgentType


def run_claude(
    prompt: str,
    workdir: Path,
    timeout: int = 300,
    max_turns: int = 50,
    env: Optional[dict] = None,
) -> Trajectory:
    """Run Claude Code in headless mode and capture trajectory.

    Uses `claude -p --output-format json` for structured output. The prompt
    is fed on stdin; all failure modes (non-zero exit with no output,
    timeout, unparseable JSON) come back as a Trajectory with `error` set
    rather than raising.
    """
    cmd = [
        "claude", "-p",
        "--output-format", "json",
        "--max-turns", str(max_turns),
        "--no-session-persistence",
        "--dangerously-skip-permissions",
    ]

    started = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=workdir,
            env=env,
        )
        elapsed_ms = int((time.monotonic() - started) * 1000)

        # Hard failure with nothing on stdout: no JSON to parse, so surface
        # stderr (truncated) as the error. A non-zero exit *with* stdout
        # still goes through the JSON path below.
        if proc.returncode != 0 and not proc.stdout.strip():
            return Trajectory(
                duration_ms=elapsed_ms,
                error=proc.stderr[:1000],
            )

        payload = json.loads(proc.stdout)
        usage = payload.get("usage", {})

        # Claude API token breakdown:
        #   input_tokens: fresh (non-cached) input
        #   cache_read_input_tokens: replayed context (cheap, ~10x less)
        #   cache_creation_input_tokens: new context written to cache
        # Fresh input is kept separate from cache reads so metrics reflect
        # actual work, not context replay overhead.
        fresh_input = (
            usage.get("input_tokens", 0)
            + usage.get("cache_creation_input_tokens", 0)
        )

        if payload.get("is_error"):
            err = payload.get("result", "unknown error")
        else:
            err = ""

        return Trajectory(
            input_tokens=fresh_input,
            output_tokens=usage.get("output_tokens", 0),
            cache_read_tokens=usage.get("cache_read_input_tokens", 0),
            total_cost_usd=payload.get("total_cost_usd", 0.0),
            duration_ms=payload.get("duration_ms", elapsed_ms),
            num_turns=payload.get("num_turns", 0),
            error=err,
        )

    except subprocess.TimeoutExpired:
        elapsed_ms = int((time.monotonic() - started) * 1000)
        return Trajectory(duration_ms=elapsed_ms, error="timeout")
    except (json.JSONDecodeError, KeyError) as e:
        elapsed_ms = int((time.monotonic() - started) * 1000)
        return Trajectory(duration_ms=elapsed_ms, error=f"parse error: {e}")


def run_codex(
    prompt: str,
    workdir: Path,
    timeout: int = 300,
    env: Optional[dict] = None,
) -> Trajectory:
    """Run Codex CLI in headless mode and capture trajectory.

    Codex has no structured output mode here, so token usage is scraped
    from the textual footer; timeouts and non-zero exits come back as a
    Trajectory with `error` set rather than raising.
    """
    cmd = [
        "codex", "exec",
        "--dangerously-bypass-approvals-and-sandbox",
        prompt,
    ]

    started = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=workdir,
            env=env,
        )
        elapsed_ms = int((time.monotonic() - started) * 1000)

        # Codex reports "tokens used\nNNNN" in its output footer. Walk
        # adjacent line pairs; stop only once a numeric value is found
        # after a "tokens used" label.
        tokens = 0
        footer = proc.stdout.strip().split("\n")
        for label, value in zip(footer, footer[1:]):
            if label.strip() == "tokens used":
                candidate = value.strip().replace(",", "")
                if candidate.isdigit():
                    tokens = int(candidate)
                    break

        return Trajectory(
            input_tokens=tokens,  # Codex reports total, not split
            duration_ms=elapsed_ms,
            error="" if proc.returncode == 0 else proc.stderr[:1000],
        )

    except subprocess.TimeoutExpired:
        elapsed_ms = int((time.monotonic() - started) * 1000)
        return Trajectory(duration_ms=elapsed_ms, error="timeout")


# Dispatch table mapping each AgentType to its headless runner; each runner
# takes (prompt, workdir, ...) and returns a Trajectory.
AGENT_RUNNERS = {
    AgentType.CLAUDE: run_claude,
    AgentType.CODEX: run_codex,
}
121 changes: 121 additions & 0 deletions deliberate_eval/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""CLI entry point for deliberate-eval."""

import argparse
import sys
from pathlib import Path

from deliberate_eval import AgentType, Treatment


def cmd_run(args: argparse.Namespace) -> None:
    """Run the eval: print a plan summary, execute, then print totals."""
    from deliberate_eval.runner import run_pilot
    from deliberate_eval.tasks import load_tasks

    tasks_path = Path(args.tasks)
    output_path = Path(args.output)
    eval_dir = Path(args.eval_dir) if args.eval_dir else None
    clone_dir = Path(args.clone_dir) if args.clone_dir else None
    agent = AgentType(args.agent)

    tasks = load_tasks(tasks_path)

    print(f"Running eval: {tasks_path}")
    print(f"  Tasks: {len(tasks)}")
    print(f"  Agent: {agent.value}")
    print(f"  Seeds: {args.seeds}")
    # NOTE(review): assumes 2 treatments per task, but Treatment defines
    # three classes — confirm run_pilot really runs only two.
    print(f"  Total runs: {len(tasks) * 2 * args.seeds}")
    print(f"  Output: {output_path}")
    print()

    runs = run_pilot(
        tasks_path=tasks_path,
        output_path=output_path,
        agent=agent,
        seeds=args.seeds,
        eval_dir=eval_dir,
        clone_dir=clone_dir,
        agent_timeout=args.timeout,
    )

    n_passed = sum(1 for r in runs if r.trajectory.passed)
    n_failed = sum(
        1 for r in runs if r.status == "completed" and not r.trajectory.passed
    )
    n_errors = sum(1 for r in runs if r.status == "failed")
    print(f"\nDone: {n_passed} passed, {n_failed} failed, {n_errors} errors")
    print(f"Results: {output_path}")


def cmd_report(args: argparse.Namespace) -> None:
    """Generate a comparison report, optionally with a per-task breakdown."""
    from deliberate_eval.metrics import compare_treatments
    from deliberate_eval.report import load_runs, format_comparison, format_per_task

    results = load_runs(Path(args.results))
    summary = compare_treatments(results)
    print(format_comparison(summary))

    if args.per_task:
        print(format_per_task(results))


def cmd_validate(args: argparse.Namespace) -> None:
    """Validate tasks in a JSONL file.

    Prints one line per task; exits with status 1 if any task fails
    validation. Fix vs. original: error lines now name the failing task
    (previously errors were printed with no task id, so with many tasks
    there was no way to tell which one was broken), and the needless
    ``f`` prefix on the placeholder-free summary string is removed
    (ruff F541).
    """
    from deliberate_eval.tasks import load_tasks, validate_task

    tasks = load_tasks(Path(args.tasks))
    all_ok = True
    for task in tasks:
        errors = validate_task(task)
        if errors:
            all_ok = False
            # Name the task so errors are traceable in multi-task files.
            for e in errors:
                print(f"  ERROR [{task.id}]: {e}")
        else:
            print(f"  OK: {task.id} [{task.difficulty}]")

    if all_ok:
        print(f"\nAll {len(tasks)} tasks valid.")
    else:
        print("\nSome tasks have errors.")
        sys.exit(1)


def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse args and dispatch to the chosen subcommand.

    With no subcommand, prints help and exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog="deliberate-eval",
        description="Evaluate whether AI planning tools improve agent coding outcomes.",
    )
    subparsers = parser.add_subparsers(dest="command")

    # run: execute the eval over a tasks file
    run_parser = subparsers.add_parser("run", help="Run the eval")
    run_parser.add_argument("tasks", help="Path to tasks JSONL file")
    run_parser.add_argument(
        "-o", "--output", default="results.jsonl", help="Output JSONL path"
    )
    run_parser.add_argument(
        "-a", "--agent", default="claude", choices=["claude", "codex"]
    )
    run_parser.add_argument(
        "-s", "--seeds", type=int, default=3, help="Seeds per treatment"
    )
    run_parser.add_argument(
        "-t", "--timeout", type=int, default=300, help="Agent timeout (seconds)"
    )
    run_parser.add_argument(
        "--eval-dir", help="Base eval directory (for patches, etc.)"
    )
    run_parser.add_argument("--clone-dir", help="Directory with pre-cloned repos")
    run_parser.set_defaults(func=cmd_run)

    # report: summarize a results file
    report_parser = subparsers.add_parser("report", help="Generate comparison report")
    report_parser.add_argument("results", help="Path to results JSONL file")
    report_parser.add_argument(
        "--per-task", action="store_true", help="Show per-task breakdown"
    )
    report_parser.set_defaults(func=cmd_report)

    # validate: sanity-check a tasks file
    validate_parser = subparsers.add_parser("validate", help="Validate tasks JSONL")
    validate_parser.add_argument("tasks", help="Path to tasks JSONL file")
    validate_parser.set_defaults(func=cmd_validate)

    args = parser.parse_args(argv)
    if not args.command:
        parser.print_help()
        sys.exit(1)

    args.func(args)


if __name__ == "__main__":
main()
Loading
Loading