Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ __pycache__/
*.py[cod]
.idea/
external/*
*.rej
*.orig
**/uv.lock
*.egg-info/
**/.venv/
Expand Down
42 changes: 42 additions & 0 deletions configs/m4_external_optimizers_demo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
runs_dir: runs
mode: real
seeds: [123]
max_workers: 1
resume: auto

tasks:
- id: trace_examples:greeting_stub

trainers:
- id: DSPy-MIPROv2
optimizer_kwargs:
auto: light
max_bootstrapped_demos: 1
max_labeled_demos: 1
params_variants:
- num_trials: 1

- id: DSPy-BootstrapFewShot
optimizer_kwargs:
max_bootstrapped_demos: 1
max_labeled_demos: 1
max_rounds: 1
params_variants:
- {}

- id: TextGrad-TGD
optimizer_kwargs:
engine: ${TRACE_LITELLM_MODEL:-gpt-4o-mini}
params_variants:
- num_steps: 1

- id: OpenEvolve
params_variants:
- iterations: 1

- id: GEPA-Base
params_variants:
- gepa_iters: 1
gepa_train_bs: 1
gepa_merge_every: 1
gepa_pareto_subset: 1
60 changes: 60 additions & 0 deletions configs/m4_external_optimizers_real.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
runs_dir: runs_external
mode: real
seeds: [123]
max_workers: 1
resume: auto
job_timeout: 300

# Keep the benchmark simple and comparable:
# same task, same seed, same provider/model, different trainer families only.
llm:
provider: openrouter
base_url: https://openrouter.ai/api/v1
model: openrouter/openai/gpt-4o-mini
api_key_env: OPENROUTER_API_KEY

tags:
- m4
- external-trainers
- real-comparison

tasks:
- id: trace_examples:greeting_stub

trainers:
# Native Trace baseline
- id: GEPA-Base
params_variants:
- gepa_iters: 3
gepa_train_bs: 1
gepa_merge_every: 1
gepa_pareto_subset: 1

# DSPy family
- id: DSPy-MIPROv2
optimizer_kwargs:
auto: light
max_bootstrapped_demos: 1
max_labeled_demos: 1
params_variants:
- num_trials: 3

- id: DSPy-BootstrapFewShot
optimizer_kwargs:
max_bootstrapped_demos: 1
max_labeled_demos: 1
max_rounds: 1
params_variants:
- {}

# TextGrad family
- id: TextGrad-TGD
params_variants:
- num_steps: 3

# OpenEvolve family
- id: OpenEvolve
optimizer_kwargs:
include_artifacts: true
params_variants:
- iterations: 3
6 changes: 1 addition & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,7 @@
import setuptools

here = os.path.abspath(os.path.dirname(__file__))
version = {}
with open(os.path.join(here, "opto/version.py"), encoding="utf8") as fp:
exec(fp.read(), version)
__version__ = version["__version__"]

__version__ = "0.1.0"

install_requires = [
"graphviz>=0.20.1",
Expand Down
Empty file added tests/__init__.py
Empty file.
Empty file added tests/m4/__init__.py
Empty file.
65 changes: 65 additions & 0 deletions tests/m4/_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
class DummyParam:
    """Minimal stand-in for a trainable text parameter.

    Exposes only the surface the adapter tests touch: a ``data`` payload,
    a ``trainable`` flag, and an in-place ``_set`` update.
    """

    def __init__(self, value="Hi"):
        # Start from the given greeting text; tests overwrite it via _set().
        self.data = value
        self.trainable = True

    def _set(self, value):
        """Replace the stored value in place."""
        self.data = value

class DummyGuide:
    """Exact-match guide: returns (score, feedback) for a response vs. reference."""

    def __call__(self, query, response, reference):
        # Binary score; feedback tells the optimizer what was expected on a miss.
        if response == reference:
            return 1.0, "Correct"
        return 0.0, f"Expected: {reference}"

def make_bundle():
    """Build a self-contained dummy training bundle for the adapter tests.

    Returns a dict with the keys the external-optimizer adapters read:
    a trainable param, a guide, a one-example dataset, optimizer kwargs,
    metadata, and per-framework glue under ``frameworks``.
    """
    param = DummyParam("Hi")
    guide = DummyGuide()
    train_dataset = {"inputs": ["Hello I am Sam"], "infos": ["Hello, Sam!"]}

    def score_for_greeting(candidate_greeting: str):
        # Render the candidate prefix against the fixed example and score it.
        response = f"{candidate_greeting}, Sam!"
        return guide("Hello I am Sam", response, "Hello, Sam!")

    class Program:
        # Minimal callable "program": applies a greeting prefix to a parsed name.
        def __init__(self, greeting="Hi"):
            self.greeting = greeting

        def __call__(self, query):
            name = query.split()[-1].strip("!.?")
            return f"{self.greeting}, {name}!"

    dspy_glue = {
        "program_factory": lambda: Program("Hi"),
        "metric": lambda *args, **kwargs: None,
        "to_trainset": lambda dataset: [{"input": dataset["inputs"][0], "label": dataset["infos"][0]}],
        "evaluate": lambda program, trainset: score_for_greeting(program.greeting)[0],
        "sync_to_bundle": lambda program, bundle: bundle["param"]._set(program.greeting),
        "state_serializer": lambda value: {"kind": "Program", "greeting": getattr(value, "greeting", value)},
    }

    textgrad_glue = {
        "initial_text": "Hi",
        "role_description": "greeting prefix",
        "evaluate": lambda candidate: score_for_greeting(candidate)[0],
        "feedback": lambda candidate: score_for_greeting(candidate)[1],
        "apply_update": lambda candidate: param._set(candidate),
        "state_serializer": lambda value: {"kind": "TextState", "greeting": value},
    }

    def _evaluate_program(candidate):
        # Single scoring pass; artifacts echo the candidate for inspection.
        score, feedback = score_for_greeting(candidate)
        return {"score": score, "feedback": feedback, "artifacts": {"candidate": candidate}}

    openevolve_glue = {
        "initial_program": "Hi",
        "evaluate_program": _evaluate_program,
        "apply_candidate": lambda candidate, bundle: bundle["param"]._set(candidate),
        "state_serializer": lambda value: {"kind": "ProgramText", "greeting": value},
    }

    return {
        "param": param,
        "guide": guide,
        "train_dataset": train_dataset,
        "optimizer_kwargs": {"objective": "Improve greeting", "memory_size": 5},
        "metadata": {"benchmark": "test", "entry": "dummy"},
        "frameworks": {
            "dspy": dspy_glue,
            "textgrad": textgrad_glue,
            "openevolve": openevolve_glue,
        },
    }
73 changes: 73 additions & 0 deletions tests/m4/test_dspy_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import types
import sys

from tests.m4._helpers import make_bundle
from trace_bench.config import TrainerConfig
from trace_bench.integrations.external_optimizers import run_dspy_trainer


class _FakeMIPROv2:
def __init__(self, metric=None, **kwargs):
self.metric = metric
self.kwargs = kwargs

def compile(self, student, trainset=None, **kwargs):
student.greeting = "Hello"
return student


class _FakeBootstrapFewShot(_FakeMIPROv2):
    # Same fake behavior as _FakeMIPROv2; only the class name differs so the
    # adapter's by-name optimizer lookup can resolve BootstrapFewShot.
    pass


class _FakeBootstrapStrict:
def __init__(self, metric=None, **kwargs):
self.metric = metric
self.kwargs = kwargs

def compile(self, student, trainset=None):
student.greeting = "Hello"
return student


def test_dspy_mipro_adapter(monkeypatch):
    """Adapter resolves MIPROv2 from the top-level dspy module and syncs the result."""
    fake_dspy = types.SimpleNamespace(MIPROv2=_FakeMIPROv2, BootstrapFewShot=_FakeBootstrapFewShot)
    monkeypatch.setitem(sys.modules, "dspy", fake_dspy)
    bundle = make_bundle()
    cfg = TrainerConfig(id="DSPy-MIPROv2", optimizer_kwargs={"auto": "light"})
    outcome = run_dspy_trainer(bundle, cfg, {"num_trials": 1}, mode="stub")
    assert outcome["status"] == "ok"
    assert outcome["resolved_optimizer"] == "DSPy.DSPy-MIPROv2"
    assert bundle["param"].data == "Hello"


def test_dspy_fallback_to_teleprompt(monkeypatch):
    """When dspy has no top-level optimizers, the adapter falls back to dspy.teleprompt."""
    teleprompt_ns = types.SimpleNamespace(MIPROv2=_FakeMIPROv2, BootstrapFewShot=_FakeBootstrapFewShot)
    monkeypatch.setitem(sys.modules, "dspy", types.SimpleNamespace(teleprompt=teleprompt_ns))
    bundle = make_bundle()
    cfg = TrainerConfig(id="DSPy-MIPROv2", optimizer_kwargs={"auto": "light"})
    outcome = run_dspy_trainer(bundle, cfg, {"num_trials": 1}, mode="stub")
    assert outcome["status"] == "ok"
    assert bundle["param"].data == "Hello"


def test_dspy_fallback_to_mipro_alias(monkeypatch):
    """The adapter accepts the older MIPRO name when MIPROv2 is unavailable."""
    teleprompt_ns = types.SimpleNamespace(MIPRO=_FakeMIPROv2, BootstrapFewShot=_FakeBootstrapFewShot)
    monkeypatch.setitem(sys.modules, "dspy", types.SimpleNamespace(teleprompt=teleprompt_ns))
    bundle = make_bundle()
    cfg = TrainerConfig(id="DSPy-MIPROv2", optimizer_kwargs={"auto": "light"})
    outcome = run_dspy_trainer(bundle, cfg, {"num_trials": 1}, mode="stub")
    assert outcome["status"] == "ok"
    assert bundle["param"].data == "Hello"


def test_dspy_compile_filters_optimizer_kwargs(monkeypatch):
    """Kwargs not accepted by the optimizer's strict compile() signature are dropped."""
    monkeypatch.setitem(sys.modules, "dspy", types.SimpleNamespace(BootstrapFewShot=_FakeBootstrapStrict))
    bundle = make_bundle()
    cfg = TrainerConfig(id="DSPy-BootstrapFewShot", optimizer_kwargs={"max_rounds": 1})
    outcome = run_dspy_trainer(bundle, cfg, {"optimizer_kwargs": {"ignored": True}}, mode="stub")
    assert outcome["status"] == "ok"
    assert bundle["param"].data == "Hello"
25 changes: 25 additions & 0 deletions tests/m4/test_external_demo_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from trace_bench.config import load_config


def test_external_demo_config_parses():
    """The demo config loads and declares the expected trainer lineup, in order."""
    cfg = load_config("configs/m4_external_optimizers_demo.yaml")
    expected_ids = [
        "DSPy-MIPROv2",
        "DSPy-BootstrapFewShot",
        "TextGrad-TGD",
        "OpenEvolve",
        "GEPA-Base",
    ]
    assert [trainer.id for trainer in cfg.trainers] == expected_ids
    assert cfg.tasks[0].id == "trace_examples:greeting_stub"


def test_trainer_comparison_rows_shape():
    """Comparison rows exclude failed jobs and rank the rest by score_best, high first."""
    from trace_bench.results import build_trainer_comparison_rows

    jobs = [
        {"task_id": "t1", "suite": "s", "job_id": "j1", "trainer_id": "A", "status": "ok", "score_best": 0.5, "time_seconds": 1.0},
        {"task_id": "t1", "suite": "s", "job_id": "j2", "trainer_id": "B", "status": "ok", "score_best": 0.9, "time_seconds": 2.0},
        {"task_id": "t1", "suite": "s", "job_id": "j3", "trainer_id": "C", "status": "failed", "score_best": None, "time_seconds": 3.0},
    ]
    ranked = build_trainer_comparison_rows(jobs)
    assert [row["trainer_id"] for row in ranked] == ["B", "A"]
9 changes: 9 additions & 0 deletions tests/m4/test_external_trainers_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from trace_bench.registry import discover_trainers


def test_external_trainers_are_listed():
    """Every external trainer adapter is discoverable via the registry."""
    discovered = {spec.id for spec in discover_trainers()}
    for trainer_id in ("DSPy-MIPROv2", "DSPy-BootstrapFewShot", "TextGrad-TGD", "OpenEvolve"):
        assert trainer_id in discovered
41 changes: 41 additions & 0 deletions tests/m4/test_openevolve_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import types
import sys
import tempfile
from pathlib import Path

from tests.m4._helpers import make_bundle
from trace_bench.config import TrainerConfig
from trace_bench.integrations.external_optimizers import run_openevolve_trainer


class _Result:
def __init__(self, best_code):
self.best_code = best_code


def _run_evolution(initial_program, evaluator, iterations=1):
    """Fake openevolve.run_evolution: emit one candidate file, score it, return "Hello".

    Writes the candidate into a throwaway temp dir so the evaluator receives a
    real file path, as the adapter under test expects.
    """
    with tempfile.TemporaryDirectory() as workdir:
        candidate_path = Path(workdir) / "candidate.py"
        candidate_path.write_text("Hello", encoding="utf-8")
        evaluator(str(candidate_path))
        return _Result(best_code="Hello")


def test_openevolve_adapter(monkeypatch):
    """Adapter drives openevolve.run_evolution and applies the best candidate."""
    monkeypatch.setitem(sys.modules, "openevolve", types.SimpleNamespace(run_evolution=_run_evolution))
    bundle = make_bundle()
    cfg = TrainerConfig(id="OpenEvolve", optimizer_kwargs={"include_artifacts": True})
    outcome = run_openevolve_trainer(bundle, cfg, {"iterations": 1}, mode="stub")
    assert outcome["status"] == "ok"
    assert outcome["resolved_optimizer"] == "OpenEvolve.run_evolution"
    assert bundle["param"].data == "Hello"


def test_openevolve_filters_unsupported_kwargs(monkeypatch):
    """Params not understood by run_evolution (e.g. max_artifact_bytes) are filtered out."""
    monkeypatch.setitem(sys.modules, "openevolve", types.SimpleNamespace(run_evolution=_run_evolution))
    bundle = make_bundle()
    cfg = TrainerConfig(id="OpenEvolve", optimizer_kwargs={"include_artifacts": True})
    outcome = run_openevolve_trainer(bundle, cfg, {"iterations": 1, "max_artifact_bytes": 123}, mode="stub")
    assert outcome["status"] == "ok"
29 changes: 29 additions & 0 deletions tests/m4/test_runner_external_dispatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import types
import sys

from tests.m4._helpers import make_bundle
from trace_bench.config import TrainerConfig
from trace_bench.runner import _train_bundle


class _FakeMIPROv2:
def __init__(self, metric=None, **kwargs):
self.metric = metric

def compile(self, student, trainset=None, **kwargs):
student.greeting = "Hello"
return student


class _FakeBootstrapFewShot(_FakeMIPROv2):
    # Identical fake behavior; distinct class name lets the runner's by-name
    # optimizer resolution find BootstrapFewShot on the fake dspy module.
    pass


def test_runner_dispatches_external_trainer(monkeypatch):
    """_train_bundle routes a DSPy trainer id to the external dspy adapter."""
    fake_dspy = types.SimpleNamespace(MIPROv2=_FakeMIPROv2, BootstrapFewShot=_FakeBootstrapFewShot)
    monkeypatch.setitem(sys.modules, "dspy", fake_dspy)
    bundle = make_bundle()
    cfg = TrainerConfig(id="DSPy-MIPROv2")
    outcome = _train_bundle(bundle, cfg, {"num_trials": 1}, mode="stub")
    assert outcome["status"] == "ok"
    assert outcome["resolved_optimizer"] == "DSPy.DSPy-MIPROv2"
Loading