Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions deliberate_eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""deliberate-eval — measure whether AI planning tools improve agent outcomes."""

import json
from dataclasses import dataclass, field, asdict
from enum import Enum
from pathlib import Path
from typing import Optional


class Treatment(Enum):
    """Planning treatment levels applied to a task before the agent runs."""

    CLASS_A = "class_a"  # no planning at all — the agent just does it
    CLASS_B = "class_b"  # lightweight brief with a checklist
    CLASS_C = "class_c"  # full campaign: spec, then plan, then tasks


class AgentType(Enum):
    """The agent CLIs this harness knows how to drive."""

    CLAUDE = "claude"
    CODEX = "codex"


@dataclass
class Task:
    """A coding task to evaluate.

    Identified by ``id``; ``test_command`` is the success oracle run after
    the agent finishes.
    """

    id: str
    description: str
    repo: str  # owner/repo or local path
    test_command: str  # command to verify success
    difficulty: str = "medium"  # trivial, medium, hard
    issue_url: str = ""
    repo_ref: str = "main"  # git ref to check out
    setup_command: str = ""  # optional setup before agent runs

    def to_dict(self) -> dict:
        """Serialize to a plain ``{field name: value}`` dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "Task":
        """Build a Task from a dict, silently dropping unknown keys."""
        known = cls.__dataclass_fields__
        return cls(**{key: val for key, val in d.items() if key in known})


@dataclass
class Trajectory:
    """Captured metrics from a single agent run."""

    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0
    total_cost_usd: float = 0.0
    duration_ms: int = 0
    num_turns: int = 0
    files_changed: list[str] = field(default_factory=list)
    lines_added: int = 0
    lines_removed: int = 0
    passed: bool = False
    test_output: str = ""
    error: str = ""

    @property
    def total_tokens(self) -> int:
        """Fresh input plus output tokens (cache reads excluded)."""
        return self.input_tokens + self.output_tokens

    def to_dict(self) -> dict:
        """Serialize to a dict, with the derived total included."""
        return {**asdict(self), "total_tokens": self.total_tokens}


@dataclass
class Run:
    """A single eval run: one task × one treatment × one seed."""

    task_id: str
    treatment: str
    agent: str
    seed: int = 0
    trajectory: Trajectory = field(default_factory=Trajectory)
    status: str = "pending"  # one of: pending, running, completed, failed

    def to_dict(self) -> dict:
        """Serialize to a dict; nested trajectory gains its derived total."""
        out = asdict(self)
        out["trajectory"]["total_tokens"] = self.trajectory.total_tokens
        return out

    def to_jsonl(self) -> str:
        """Render one JSONL record (no trailing newline)."""
        return json.dumps(self.to_dict())

    @classmethod
    def from_dict(cls, d: dict) -> "Run":
        """Build a Run from a dict, dropping unknown keys at both levels."""
        payload = dict(d)  # copy so the caller's dict is untouched
        raw_traj = payload.pop("trajectory", {})
        traj = Trajectory(
            **{k: v for k, v in raw_traj.items() if k in Trajectory.__dataclass_fields__}
        )
        known = cls.__dataclass_fields__
        return cls(trajectory=traj, **{k: v for k, v in payload.items() if k in known})
130 changes: 130 additions & 0 deletions deliberate_eval/agents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""Agent adapters for headless execution."""

import json
import subprocess
import time
from pathlib import Path
from typing import Optional

from deliberate_eval import Trajectory, AgentType


def run_claude(
    prompt: str,
    workdir: Path,
    timeout: int = 300,
    max_turns: int = 50,
    env: Optional[dict] = None,
) -> Trajectory:
    """Run Claude Code in headless mode and capture trajectory.

    Uses `claude -p --output-format json` for structured output. The prompt
    is fed on stdin; all failure modes (non-zero exit with no output,
    timeout, unparseable JSON) come back as a Trajectory with `error` set
    rather than raising.
    """
    cmd = [
        "claude", "-p",
        "--output-format", "json",
        "--max-turns", str(max_turns),
        "--no-session-persistence",
        "--dangerously-skip-permissions",
    ]

    started = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=workdir,
            env=env,
        )
        elapsed_ms = int((time.monotonic() - started) * 1000)

        # Hard failure with nothing on stdout: no JSON to parse, so surface
        # stderr (truncated) as the error. A non-zero exit *with* stdout
        # still goes through the JSON path below.
        if proc.returncode != 0 and not proc.stdout.strip():
            return Trajectory(
                duration_ms=elapsed_ms,
                error=proc.stderr[:1000],
            )

        payload = json.loads(proc.stdout)
        usage = payload.get("usage", {})

        # Claude API token breakdown:
        #   input_tokens: fresh (non-cached) input
        #   cache_read_input_tokens: replayed context (cheap, ~10x less)
        #   cache_creation_input_tokens: new context written to cache
        # Fresh input is kept separate from cache reads so metrics reflect
        # actual work, not context replay overhead.
        fresh_input = (
            usage.get("input_tokens", 0)
            + usage.get("cache_creation_input_tokens", 0)
        )

        if payload.get("is_error"):
            err = payload.get("result", "unknown error")
        else:
            err = ""

        return Trajectory(
            input_tokens=fresh_input,
            output_tokens=usage.get("output_tokens", 0),
            cache_read_tokens=usage.get("cache_read_input_tokens", 0),
            total_cost_usd=payload.get("total_cost_usd", 0.0),
            duration_ms=payload.get("duration_ms", elapsed_ms),
            num_turns=payload.get("num_turns", 0),
            error=err,
        )

    except subprocess.TimeoutExpired:
        elapsed_ms = int((time.monotonic() - started) * 1000)
        return Trajectory(duration_ms=elapsed_ms, error="timeout")
    except (json.JSONDecodeError, KeyError) as e:
        elapsed_ms = int((time.monotonic() - started) * 1000)
        return Trajectory(duration_ms=elapsed_ms, error=f"parse error: {e}")


def run_codex(
    prompt: str,
    workdir: Path,
    timeout: int = 300,
    env: Optional[dict] = None,
) -> Trajectory:
    """Run Codex CLI in headless mode and capture trajectory.

    Codex has no structured output mode here, so token usage is scraped
    from the textual footer; timeouts and non-zero exits come back as a
    Trajectory with `error` set rather than raising.
    """
    cmd = [
        "codex", "exec",
        "--dangerously-bypass-approvals-and-sandbox",
        prompt,
    ]

    started = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=workdir,
            env=env,
        )
        elapsed_ms = int((time.monotonic() - started) * 1000)

        # Codex reports "tokens used\nNNNN" in its output footer. Walk
        # adjacent line pairs; stop only once a numeric value is found
        # after a "tokens used" label.
        tokens = 0
        footer = proc.stdout.strip().split("\n")
        for label, value in zip(footer, footer[1:]):
            if label.strip() == "tokens used":
                candidate = value.strip().replace(",", "")
                if candidate.isdigit():
                    tokens = int(candidate)
                    break

        return Trajectory(
            input_tokens=tokens,  # Codex reports total, not split
            duration_ms=elapsed_ms,
            error="" if proc.returncode == 0 else proc.stderr[:1000],
        )

    except subprocess.TimeoutExpired:
        elapsed_ms = int((time.monotonic() - started) * 1000)
        return Trajectory(duration_ms=elapsed_ms, error="timeout")


# Dispatch table mapping each AgentType to its headless runner; each runner
# takes (prompt, workdir, ...) and returns a Trajectory.
AGENT_RUNNERS = {
    AgentType.CLAUDE: run_claude,
    AgentType.CODEX: run_codex,
}
121 changes: 121 additions & 0 deletions deliberate_eval/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""CLI entry point for deliberate-eval."""

import argparse
import sys
from pathlib import Path

from deliberate_eval import AgentType, Treatment


def cmd_run(args: argparse.Namespace) -> None:
    """Run the eval: print a plan summary, execute, then print totals."""
    from deliberate_eval.runner import run_pilot
    from deliberate_eval.tasks import load_tasks

    tasks_path = Path(args.tasks)
    output_path = Path(args.output)
    eval_dir = Path(args.eval_dir) if args.eval_dir else None
    clone_dir = Path(args.clone_dir) if args.clone_dir else None
    agent = AgentType(args.agent)

    tasks = load_tasks(tasks_path)

    print(f"Running eval: {tasks_path}")
    print(f"  Tasks: {len(tasks)}")
    print(f"  Agent: {agent.value}")
    print(f"  Seeds: {args.seeds}")
    # NOTE(review): assumes 2 treatments per task, but Treatment defines
    # three classes — confirm run_pilot really runs only two.
    print(f"  Total runs: {len(tasks) * 2 * args.seeds}")
    print(f"  Output: {output_path}")
    print()

    runs = run_pilot(
        tasks_path=tasks_path,
        output_path=output_path,
        agent=agent,
        seeds=args.seeds,
        eval_dir=eval_dir,
        clone_dir=clone_dir,
        agent_timeout=args.timeout,
    )

    n_passed = sum(1 for r in runs if r.trajectory.passed)
    n_failed = sum(
        1 for r in runs if r.status == "completed" and not r.trajectory.passed
    )
    n_errors = sum(1 for r in runs if r.status == "failed")
    print(f"\nDone: {n_passed} passed, {n_failed} failed, {n_errors} errors")
    print(f"Results: {output_path}")


def cmd_report(args: argparse.Namespace) -> None:
    """Generate a comparison report, optionally with a per-task breakdown."""
    from deliberate_eval.metrics import compare_treatments
    from deliberate_eval.report import load_runs, format_comparison, format_per_task

    results = load_runs(Path(args.results))
    summary = compare_treatments(results)
    print(format_comparison(summary))

    if args.per_task:
        print(format_per_task(results))


def cmd_validate(args: argparse.Namespace) -> None:
    """Validate tasks in a JSONL file.

    Prints one line per task; exits with status 1 if any task fails
    validation. Fix vs. original: error lines now name the failing task
    (previously errors were printed with no task id, so with many tasks
    there was no way to tell which one was broken), and the needless
    ``f`` prefix on the placeholder-free summary string is removed
    (ruff F541).
    """
    from deliberate_eval.tasks import load_tasks, validate_task

    tasks = load_tasks(Path(args.tasks))
    all_ok = True
    for task in tasks:
        errors = validate_task(task)
        if errors:
            all_ok = False
            # Name the task so errors are traceable in multi-task files.
            for e in errors:
                print(f"  ERROR [{task.id}]: {e}")
        else:
            print(f"  OK: {task.id} [{task.difficulty}]")

    if all_ok:
        print(f"\nAll {len(tasks)} tasks valid.")
    else:
        print("\nSome tasks have errors.")
        sys.exit(1)


def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse args and dispatch to the chosen subcommand.

    With no subcommand, prints help and exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog="deliberate-eval",
        description="Evaluate whether AI planning tools improve agent coding outcomes.",
    )
    subparsers = parser.add_subparsers(dest="command")

    # run: execute the eval over a tasks file
    run_parser = subparsers.add_parser("run", help="Run the eval")
    run_parser.add_argument("tasks", help="Path to tasks JSONL file")
    run_parser.add_argument(
        "-o", "--output", default="results.jsonl", help="Output JSONL path"
    )
    run_parser.add_argument(
        "-a", "--agent", default="claude", choices=["claude", "codex"]
    )
    run_parser.add_argument(
        "-s", "--seeds", type=int, default=3, help="Seeds per treatment"
    )
    run_parser.add_argument(
        "-t", "--timeout", type=int, default=300, help="Agent timeout (seconds)"
    )
    run_parser.add_argument(
        "--eval-dir", help="Base eval directory (for patches, etc.)"
    )
    run_parser.add_argument("--clone-dir", help="Directory with pre-cloned repos")
    run_parser.set_defaults(func=cmd_run)

    # report: summarize a results file
    report_parser = subparsers.add_parser("report", help="Generate comparison report")
    report_parser.add_argument("results", help="Path to results JSONL file")
    report_parser.add_argument(
        "--per-task", action="store_true", help="Show per-task breakdown"
    )
    report_parser.set_defaults(func=cmd_report)

    # validate: sanity-check a tasks file
    validate_parser = subparsers.add_parser("validate", help="Validate tasks JSONL")
    validate_parser.add_argument("tasks", help="Path to tasks JSONL file")
    validate_parser.set_defaults(func=cmd_validate)

    args = parser.parse_args(argv)
    if not args.command:
        parser.print_help()
        sys.exit(1)

    args.func(args)


if __name__ == "__main__":
main()
Loading
Loading