From 51a4ba2f00937c9c2bc02a5352b6b0ecb78d2543 Mon Sep 17 00:00:00 2001
From: doxav
Date: Tue, 24 Mar 2026 17:59:15 +0100
Subject: [PATCH] Create external trainers (DSPy, TextGrad, OpenEvolve...)

---
 .gitignore                                   |   2 +
 configs/m4_external_optimizers_demo.yaml     |  42 ++
 configs/m4_external_optimizers_real.yaml     |  60 +++
 setup.py                                     |   6 +-
 tests/__init__.py                            |   0
 tests/m4/__init__.py                         |   0
 tests/m4/_helpers.py                         |  65 +++
 tests/m4/test_dspy_adapter.py                |  73 +++
 tests/m4/test_external_demo_config.py        |  25 +
 tests/m4/test_external_trainers_registry.py  |   9 +
 tests/m4/test_openevolve_adapter.py          |  41 ++
 tests/m4/test_runner_external_dispatch.py    |  29 ++
 tests/m4/test_textgrad_adapter.py            |  52 ++
 tests/m4/test_validate_external_trainers.py  |  47 ++
 trace_bench/artifacts.py                     |   4 +
 trace_bench/cli.py                           |  32 +-
 trace_bench/examples/greeting_stub.py        | 110 ++++
 .../integrations/external_optimizers.py      | 477 ++++++++++++++++++
 trace_bench/registry.py                      |   3 +
 trace_bench/resolve.py                       |   6 +
 trace_bench/results.py                       |  32 ++
 trace_bench/runner.py                        |  99 +++-
 22 files changed, 1184 insertions(+), 30 deletions(-)
 create mode 100644 configs/m4_external_optimizers_demo.yaml
 create mode 100644 configs/m4_external_optimizers_real.yaml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/m4/__init__.py
 create mode 100644 tests/m4/_helpers.py
 create mode 100644 tests/m4/test_dspy_adapter.py
 create mode 100644 tests/m4/test_external_demo_config.py
 create mode 100644 tests/m4/test_external_trainers_registry.py
 create mode 100644 tests/m4/test_openevolve_adapter.py
 create mode 100644 tests/m4/test_runner_external_dispatch.py
 create mode 100644 tests/m4/test_textgrad_adapter.py
 create mode 100644 tests/m4/test_validate_external_trainers.py
 create mode 100644 trace_bench/integrations/external_optimizers.py

diff --git a/.gitignore b/.gitignore
index 2f88be0..367a127 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@ __pycache__/
 *.py[cod]
 .idea/
 external/*
+*.rej
+*.orig
 **/uv.lock
 *.egg-info/
 **/.venv/
diff --git a/configs/m4_external_optimizers_demo.yaml b/configs/m4_external_optimizers_demo.yaml
new file mode 100644
index 0000000..aa43571
--- /dev/null
+++ b/configs/m4_external_optimizers_demo.yaml
@@ -0,0 +1,42 @@
+runs_dir: runs
+mode: real
+seeds: [123]
+max_workers: 1
+resume: auto
+
+tasks:
+  - id: trace_examples:greeting_stub
+
+trainers:
+  - id: DSPy-MIPROv2
+    optimizer_kwargs:
+      auto: light
+      max_bootstrapped_demos: 1
+      max_labeled_demos: 1
+    params_variants:
+      - num_trials: 1
+
+  - id: DSPy-BootstrapFewShot
+    optimizer_kwargs:
+      max_bootstrapped_demos: 1
+      max_labeled_demos: 1
+      max_rounds: 1
+    params_variants:
+      - {}
+
+  - id: TextGrad-TGD
+    optimizer_kwargs:
+      engine: ${TRACE_LITELLM_MODEL:-gpt-4o-mini}
+    params_variants:
+      - num_steps: 1
+
+  - id: OpenEvolve
+    params_variants:
+      - iterations: 1
+
+  - id: GEPA-Base
+    params_variants:
+      - gepa_iters: 1
+        gepa_train_bs: 1
+        gepa_merge_every: 1
+        gepa_pareto_subset: 1
diff --git a/configs/m4_external_optimizers_real.yaml b/configs/m4_external_optimizers_real.yaml
new file mode 100644
index 0000000..c24e9c0
--- /dev/null
+++ b/configs/m4_external_optimizers_real.yaml
@@ -0,0 +1,60 @@
+runs_dir: runs_external
+mode: real
+seeds: [123]
+max_workers: 1
+resume: auto
+job_timeout: 300
+
+# Keep the benchmark simple and comparable:
+# same task, same seed, same provider/model, different trainer families only.
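+# To change providers, edit only the llm block below (for example provider:
+# openai with api_key_env: OPENAI_API_KEY); the trainer list stays fixed so
+# results remain comparable across frameworks.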
+llm: + provider: openrouter + base_url: https://openrouter.ai/api/v1 + model: openrouter/openai/gpt-4o-mini + api_key_env: OPENROUTER_API_KEY + +tags: + - m4 + - external-trainers + - real-comparison + +tasks: + - id: trace_examples:greeting_stub + +trainers: + # Native Trace baseline + - id: GEPA-Base + params_variants: + - gepa_iters: 3 + gepa_train_bs: 1 + gepa_merge_every: 1 + gepa_pareto_subset: 1 + + # DSPy family + - id: DSPy-MIPROv2 + optimizer_kwargs: + auto: light + max_bootstrapped_demos: 1 + max_labeled_demos: 1 + params_variants: + - num_trials: 3 + + - id: DSPy-BootstrapFewShot + optimizer_kwargs: + max_bootstrapped_demos: 1 + max_labeled_demos: 1 + max_rounds: 1 + params_variants: + - {} + + # TextGrad family + - id: TextGrad-TGD + params_variants: + - num_steps: 3 + + # OpenEvolve family + - id: OpenEvolve + optimizer_kwargs: + include_artifacts: true + params_variants: + - iterations: 3 diff --git a/setup.py b/setup.py index 7140f3b..8eb2658 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,7 @@ import setuptools here = os.path.abspath(os.path.dirname(__file__)) -version = {} -with open(os.path.join(here, "opto/version.py"), encoding="utf8") as fp: - exec(fp.read(), version) -__version__ = version["__version__"] - +__version__ = "0.1.0" install_requires = [ "graphviz>=0.20.1", diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/m4/__init__.py b/tests/m4/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/m4/_helpers.py b/tests/m4/_helpers.py new file mode 100644 index 0000000..a472868 --- /dev/null +++ b/tests/m4/_helpers.py @@ -0,0 +1,65 @@ +class DummyParam: + def __init__(self, value="Hi"): + self.data = value + self.trainable = True + def _set(self, value): + self.data = value + +class DummyGuide: + def __call__(self, query, response, reference): + score = 1.0 if response == reference else 0.0 + feedback = "Correct" if score == 1.0 else f"Expected: {reference}" + return score, feedback + +def make_bundle(): + param = DummyParam("Hi") + guide = DummyGuide() + train_dataset = {"inputs": ["Hello I am Sam"], "infos": ["Hello, Sam!"]} + + def score_for_greeting(candidate_greeting: str): + response = f"{candidate_greeting}, Sam!" + return guide("Hello I am Sam", response, "Hello, Sam!") + + class Program: + def __init__(self, greeting="Hi"): + self.greeting = greeting + def __call__(self, query): + name = query.split()[-1].strip("!.?") + return f"{self.greeting}, {name}!" 
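+
+    # The "frameworks" payload below mirrors the adapter contract real tasks
+    # expose: one entry per external optimizer family (dspy / textgrad /
+    # openevolve), each mapping the shared greeting parameter onto that
+    # framework's native program, variable, or program-text shape.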
+ + bundle = { + "param": param, + "guide": guide, + "train_dataset": train_dataset, + "optimizer_kwargs": {"objective": "Improve greeting", "memory_size": 5}, + "metadata": {"benchmark": "test", "entry": "dummy"}, + "frameworks": { + "dspy": { + "program_factory": lambda: Program("Hi"), + "metric": lambda *args, **kwargs: None, + "to_trainset": lambda dataset: [{"input": dataset["inputs"][0], "label": dataset["infos"][0]}], + "evaluate": lambda program, trainset: score_for_greeting(program.greeting)[0], + "sync_to_bundle": lambda program, bundle: bundle["param"]._set(program.greeting), + "state_serializer": lambda value: {"kind": "Program", "greeting": getattr(value, "greeting", value)}, + }, + "textgrad": { + "initial_text": "Hi", + "role_description": "greeting prefix", + "evaluate": lambda candidate: score_for_greeting(candidate)[0], + "feedback": lambda candidate: score_for_greeting(candidate)[1], + "apply_update": lambda candidate: param._set(candidate), + "state_serializer": lambda value: {"kind": "TextState", "greeting": value}, + }, + "openevolve": { + "initial_program": "Hi", + "evaluate_program": lambda candidate: { + "score": score_for_greeting(candidate)[0], + "feedback": score_for_greeting(candidate)[1], + "artifacts": {"candidate": candidate}, + }, + "apply_candidate": lambda candidate, bundle: bundle["param"]._set(candidate), + "state_serializer": lambda value: {"kind": "ProgramText", "greeting": value}, + }, + }, + } + return bundle diff --git a/tests/m4/test_dspy_adapter.py b/tests/m4/test_dspy_adapter.py new file mode 100644 index 0000000..8156619 --- /dev/null +++ b/tests/m4/test_dspy_adapter.py @@ -0,0 +1,73 @@ +import types +import sys + +from tests.m4._helpers import make_bundle +from trace_bench.config import TrainerConfig +from trace_bench.integrations.external_optimizers import run_dspy_trainer + + +class _FakeMIPROv2: + def __init__(self, metric=None, **kwargs): + self.metric = metric + self.kwargs = kwargs + + def compile(self, student, trainset=None, **kwargs): + student.greeting = "Hello" + return student + + +class _FakeBootstrapFewShot(_FakeMIPROv2): + pass + + +class _FakeBootstrapStrict: + def __init__(self, metric=None, **kwargs): + self.metric = metric + self.kwargs = kwargs + + def compile(self, student, trainset=None): + student.greeting = "Hello" + return student + + +def test_dspy_mipro_adapter(monkeypatch): + fake = types.SimpleNamespace(MIPROv2=_FakeMIPROv2, BootstrapFewShot=_FakeBootstrapFewShot) + monkeypatch.setitem(sys.modules, "dspy", fake) + bundle = make_bundle() + trainer = TrainerConfig(id="DSPy-MIPROv2", optimizer_kwargs={"auto": "light"}) + result = run_dspy_trainer(bundle, trainer, {"num_trials": 1}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "DSPy.DSPy-MIPROv2" + assert bundle["param"].data == "Hello" + + +def test_dspy_fallback_to_teleprompt(monkeypatch): + teleprompt = types.SimpleNamespace(MIPROv2=_FakeMIPROv2, BootstrapFewShot=_FakeBootstrapFewShot) + fake = types.SimpleNamespace(teleprompt=teleprompt) + monkeypatch.setitem(sys.modules, "dspy", fake) + bundle = make_bundle() + trainer = TrainerConfig(id="DSPy-MIPROv2", optimizer_kwargs={"auto": "light"}) + result = run_dspy_trainer(bundle, trainer, {"num_trials": 1}, mode="stub") + assert result["status"] == "ok" + assert bundle["param"].data == "Hello" + + +def test_dspy_fallback_to_mipro_alias(monkeypatch): + teleprompt = types.SimpleNamespace(MIPRO=_FakeMIPROv2, BootstrapFewShot=_FakeBootstrapFewShot) + fake = 
types.SimpleNamespace(teleprompt=teleprompt) + monkeypatch.setitem(sys.modules, "dspy", fake) + bundle = make_bundle() + trainer = TrainerConfig(id="DSPy-MIPROv2", optimizer_kwargs={"auto": "light"}) + result = run_dspy_trainer(bundle, trainer, {"num_trials": 1}, mode="stub") + assert result["status"] == "ok" + assert bundle["param"].data == "Hello" + + +def test_dspy_compile_filters_optimizer_kwargs(monkeypatch): + fake = types.SimpleNamespace(BootstrapFewShot=_FakeBootstrapStrict) + monkeypatch.setitem(sys.modules, "dspy", fake) + bundle = make_bundle() + trainer = TrainerConfig(id="DSPy-BootstrapFewShot", optimizer_kwargs={"max_rounds": 1}) + result = run_dspy_trainer(bundle, trainer, {"optimizer_kwargs": {"ignored": True}}, mode="stub") + assert result["status"] == "ok" + assert bundle["param"].data == "Hello" diff --git a/tests/m4/test_external_demo_config.py b/tests/m4/test_external_demo_config.py new file mode 100644 index 0000000..61e5b8f --- /dev/null +++ b/tests/m4/test_external_demo_config.py @@ -0,0 +1,25 @@ +from trace_bench.config import load_config + + +def test_external_demo_config_parses(): + cfg = load_config("configs/m4_external_optimizers_demo.yaml") + trainer_ids = [t.id for t in cfg.trainers] + assert trainer_ids == [ + "DSPy-MIPROv2", + "DSPy-BootstrapFewShot", + "TextGrad-TGD", + "OpenEvolve", + "GEPA-Base", + ] + assert cfg.tasks[0].id == "trace_examples:greeting_stub" + + +def test_trainer_comparison_rows_shape(): + from trace_bench.results import build_trainer_comparison_rows + rows = [ + {"task_id": "t1", "suite": "s", "job_id": "j1", "trainer_id": "A", "status": "ok", "score_best": 0.5, "time_seconds": 1.0}, + {"task_id": "t1", "suite": "s", "job_id": "j2", "trainer_id": "B", "status": "ok", "score_best": 0.9, "time_seconds": 2.0}, + {"task_id": "t1", "suite": "s", "job_id": "j3", "trainer_id": "C", "status": "failed", "score_best": None, "time_seconds": 3.0}, + ] + out = build_trainer_comparison_rows(rows) + assert [r["trainer_id"] for r in out] == ["B", "A"] diff --git a/tests/m4/test_external_trainers_registry.py b/tests/m4/test_external_trainers_registry.py new file mode 100644 index 0000000..ba5e2ec --- /dev/null +++ b/tests/m4/test_external_trainers_registry.py @@ -0,0 +1,9 @@ +from trace_bench.registry import discover_trainers + + +def test_external_trainers_are_listed(): + ids = {spec.id for spec in discover_trainers()} + assert "DSPy-MIPROv2" in ids + assert "DSPy-BootstrapFewShot" in ids + assert "TextGrad-TGD" in ids + assert "OpenEvolve" in ids diff --git a/tests/m4/test_openevolve_adapter.py b/tests/m4/test_openevolve_adapter.py new file mode 100644 index 0000000..b5d4fc9 --- /dev/null +++ b/tests/m4/test_openevolve_adapter.py @@ -0,0 +1,41 @@ +import types +import sys +import tempfile +from pathlib import Path + +from tests.m4._helpers import make_bundle +from trace_bench.config import TrainerConfig +from trace_bench.integrations.external_optimizers import run_openevolve_trainer + + +class _Result: + def __init__(self, best_code): + self.best_code = best_code + + +def _run_evolution(initial_program, evaluator, iterations=1): + with tempfile.TemporaryDirectory() as td: + p = Path(td) / "candidate.py" + p.write_text("Hello", encoding="utf-8") + evaluator(str(p)) + return _Result(best_code="Hello") + + +def test_openevolve_adapter(monkeypatch): + fake = types.SimpleNamespace(run_evolution=_run_evolution) + monkeypatch.setitem(sys.modules, "openevolve", fake) + bundle = make_bundle() + trainer = TrainerConfig(id="OpenEvolve", 
optimizer_kwargs={"include_artifacts": True}) + result = run_openevolve_trainer(bundle, trainer, {"iterations": 1}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "OpenEvolve.run_evolution" + assert bundle["param"].data == "Hello" + + +def test_openevolve_filters_unsupported_kwargs(monkeypatch): + fake = types.SimpleNamespace(run_evolution=_run_evolution) + monkeypatch.setitem(sys.modules, "openevolve", fake) + bundle = make_bundle() + trainer = TrainerConfig(id="OpenEvolve", optimizer_kwargs={"include_artifacts": True}) + result = run_openevolve_trainer(bundle, trainer, {"iterations": 1, "max_artifact_bytes": 123}, mode="stub") + assert result["status"] == "ok" diff --git a/tests/m4/test_runner_external_dispatch.py b/tests/m4/test_runner_external_dispatch.py new file mode 100644 index 0000000..af59e46 --- /dev/null +++ b/tests/m4/test_runner_external_dispatch.py @@ -0,0 +1,29 @@ +import types +import sys + +from tests.m4._helpers import make_bundle +from trace_bench.config import TrainerConfig +from trace_bench.runner import _train_bundle + + +class _FakeMIPROv2: + def __init__(self, metric=None, **kwargs): + self.metric = metric + + def compile(self, student, trainset=None, **kwargs): + student.greeting = "Hello" + return student + + +class _FakeBootstrapFewShot(_FakeMIPROv2): + pass + + +def test_runner_dispatches_external_trainer(monkeypatch): + fake = types.SimpleNamespace(MIPROv2=_FakeMIPROv2, BootstrapFewShot=_FakeBootstrapFewShot) + monkeypatch.setitem(sys.modules, "dspy", fake) + bundle = make_bundle() + trainer = TrainerConfig(id="DSPy-MIPROv2") + result = _train_bundle(bundle, trainer, {"num_trials": 1}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "DSPy.DSPy-MIPROv2" diff --git a/tests/m4/test_textgrad_adapter.py b/tests/m4/test_textgrad_adapter.py new file mode 100644 index 0000000..8ef0aff --- /dev/null +++ b/tests/m4/test_textgrad_adapter.py @@ -0,0 +1,52 @@ +import types +import sys + +from tests.m4._helpers import make_bundle +from trace_bench.config import TrainerConfig +from trace_bench.integrations.external_optimizers import run_textgrad_trainer + + +class _Variable: + def __init__(self, value, role_description=None, requires_grad=True): + self.value = value + + +class _TextLoss: + def __init__(self, objective, engine=None): + self.objective = objective + self.engine = engine + + def __call__(self, variable): + class _Loss: + def __init__(self, variable): + self.variable = variable + + def backward(self): + return None + return _Loss(variable) + + +class _TGD: + def __init__(self, parameters=None, **kwargs): + self.parameters = parameters or [] + self.kwargs = kwargs + + def step(self): + for p in self.parameters: + p.value = "Hello" + + +def test_textgrad_adapter(monkeypatch): + calls = {} + def _set_backward_engine(engine, override=False): + calls["engine"] = engine + calls["override"] = override + fake = types.SimpleNamespace(Variable=_Variable, TextLoss=_TextLoss, TGD=_TGD, set_backward_engine=_set_backward_engine) + monkeypatch.setitem(sys.modules, "textgrad", fake) + bundle = make_bundle() + trainer = TrainerConfig(id="TextGrad-TGD", optimizer_kwargs={"engine": "test-engine"}) + result = run_textgrad_trainer(bundle, trainer, {"num_steps": 1}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "TextGrad.TGD" + assert bundle["param"].data == "Hello" + assert calls["engine"] == "test-engine" diff --git a/tests/m4/test_validate_external_trainers.py 
b/tests/m4/test_validate_external_trainers.py new file mode 100644 index 0000000..0d70634 --- /dev/null +++ b/tests/m4/test_validate_external_trainers.py @@ -0,0 +1,47 @@ +import types +import sys + +from trace_bench.cli import _validate_trainer_params +from trace_bench.config import TrainerConfig +from trace_bench.registry import discover_trainers + + +def test_validate_allows_external_kwargs(monkeypatch): + monkeypatch.setitem(sys.modules, "dspy", types.SimpleNamespace(MIPROv2=object, BootstrapFewShot=object)) + monkeypatch.setitem(sys.modules, "textgrad", types.SimpleNamespace(TGD=object)) + monkeypatch.setitem(sys.modules, "openevolve", types.SimpleNamespace(run_evolution=lambda **k: None)) + + ids = {spec.id for spec in discover_trainers() if spec.available} + assert "DSPy-MIPROv2" in ids + assert "TextGrad-TGD" in ids + assert "OpenEvolve" in ids + + errors = [] + _validate_trainer_params(TrainerConfig(id="DSPy-MIPROv2", params_variants=[{"num_trials": 1}]), errors) + _validate_trainer_params(TrainerConfig(id="TextGrad-TGD", params_variants=[{"num_steps": 1}]), errors) + _validate_trainer_params(TrainerConfig(id="OpenEvolve", params_variants=[{"iterations": 1}]), errors) + assert errors == [] + +def test_validate_external_optimizer_kwargs_are_not_flagged(tmp_path, capsys): + cfg = tmp_path / "cfg.yaml" + cfg.write_text( + """ +mode: stub +tasks: + - id: trace_examples:greeting_stub +trainers: + - id: DSPy-MIPROv2 + optimizer_kwargs: + auto: light + max_bootstrapped_demos: 1 + max_labeled_demos: 1 + params_variants: + - num_trials: 1 +""", + encoding="utf-8", + ) + from trace_bench.cli import cmd_validate + rc = cmd_validate(str(cfg), "LLM4AD/benchmark_tasks", strict=True) + out = capsys.readouterr().out + assert rc == 0, out + assert "unknown trainer kwarg 'optimizer_kwargs'" not in out diff --git a/trace_bench/artifacts.py b/trace_bench/artifacts.py index f19d7ad..f6598b1 100644 --- a/trace_bench/artifacts.py +++ b/trace_bench/artifacts.py @@ -47,6 +47,10 @@ def summary_json(self) -> Path: def leaderboard_csv(self) -> Path: return self.run_dir / "leaderboard.csv" + @property + def trainer_comparison_csv(self) -> Path: + return self.run_dir / "trainer_comparison.csv" + @property def files_index_json(self) -> Path: return self.meta_dir / "files_index.json" diff --git a/trace_bench/cli.py b/trace_bench/cli.py index f8acfa0..045978e 100644 --- a/trace_bench/cli.py +++ b/trace_bench/cli.py @@ -1,6 +1,7 @@ from __future__ import annotations import argparse +from typing import Any import json from datetime import datetime from pathlib import Path @@ -18,6 +19,7 @@ from trace_bench.runner import BenchRunner, _has_trainables from trace_bench.artifacts import init_run_dir, write_manifest from trace_bench.ui import launch_ui +from trace_bench.integrations.external_optimizers import allowed_external_trainer_kwargs, is_external_trainer def cmd_list_tasks(root: str, bench: str | None = None) -> int: @@ -52,6 +54,16 @@ def _task_in_bench(task_key: str, bench: str | None) -> bool: return True return False +# Reserved trainer-level fields that are valid in config, but are NOT runtime +# trainer kwargs and therefore must not be validated against trainer kwarg allowlists. 
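+# For example, this trainer entry passes validation: "optimizer_kwargs" is a
+# reserved field and is skipped here, while "num_trials" inside params_variants
+# is checked against the DSPy-MIPROv2 allowlist:
+#
+#   - id: DSPy-MIPROv2
+#     optimizer_kwargs: {auto: light}
+#     params_variants:
+#       - num_trials: 1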
+_RESERVED_TRAINER_FIELDS = {
+    "optimizer",
+    "optimizer_kwargs",
+    "guide",
+    "guide_kwargs",
+    "logger",
+    "logger_kwargs",
+}
 
 _ALLOWED_TRAINER_KWARGS = {
     "verbose",
@@ -126,11 +138,29 @@ def _default_timeout(mode: str) -> float:
 
 
 def _validate_trainer_params(trainer, errors: list[str]) -> None:
+    allowed = _ALLOWED_TRAINER_KWARGS
+    if is_external_trainer(trainer.id):
+        allowed = allowed_external_trainer_kwargs(trainer.id) or set()
+
     for params in trainer.params_variants or [{}]:
         for key in params.keys():
-            if key not in _ALLOWED_TRAINER_KWARGS:
+            # These keys are valid at trainer level and should never be treated as
+            # runtime trainer kwargs.
+            if key in _RESERVED_TRAINER_FIELDS:
+                continue
+            if key not in allowed:
                 errors.append(f"unknown trainer kwarg '{key}' for {trainer.id}")
 
+    if is_external_trainer(trainer.id):
+        # External frameworks own their runtime init; only logger remains a Trace concern.
+        # Still sanity-check the reserved optimizer_kwargs type if present.
+        if trainer.optimizer_kwargs is not None and not isinstance(trainer.optimizer_kwargs, dict):
+            errors.append(f"optimizer_kwargs must be a mapping for {trainer.id}")
+        if trainer.logger and trainer.logger.lower() not in ('none','null','off','disable','disabled') and not _resolve_symbol("opto.trainer.loggers", trainer.logger):
+            errors.append(f"logger not found: {trainer.logger}")
+        return
+
     if trainer.optimizer and not _resolve_symbol("opto.optimizers", trainer.optimizer):
         errors.append(f"optimizer not found: {trainer.optimizer}")
     if trainer.guide and not _resolve_symbol("opto.trainer.guide", trainer.guide):
diff --git a/trace_bench/examples/greeting_stub.py b/trace_bench/examples/greeting_stub.py
index 9c119f8..95fc3fb 100644
--- a/trace_bench/examples/greeting_stub.py
+++ b/trace_bench/examples/greeting_stub.py
@@ -26,6 +26,91 @@ def compose(self, greeting, name: str):
         return f"{greeting_value}, {name}!"
 
 
+class _GreetingDSPyProgram:
+    def __init__(self, greeting: str = "Hello"):
+        self.greeting = greeting
+
+    def __call__(self, user_query: str) -> str:
+        name = user_query.split()[-1].strip("!.?")
+        return f"{self.greeting}, {name}!"
+
+    # DSPy teleprompters expect a resettable/copyable student program.
+    def deepcopy(self):
+        return _GreetingDSPyProgram(self.greeting)
+
+    def reset_copy(self):
+        return self.deepcopy()
+
+    def predictors(self):
+        return []
+
+    def named_predictors(self):
+        return []
+
+
+def _greeting_score(candidate_greeting: str) -> tuple[float, str]:
+    guide = ExactMatchGuide()
+    query = "Hello I am Sam"
+    response = f"{candidate_greeting}, Sam!"
+ return guide.get_feedback(query, response, "Hello, Sam!") + + +def _apply_greeting(candidate_greeting: str, bundle: dict) -> None: + bundle["param"].greeting._set(candidate_greeting) + + +def _serialize_greeting_state(value) -> dict: + if isinstance(value, _GreetingDSPyProgram): + greeting = value.greeting + else: + greeting = value + return {"kind": "GreetingState", "greeting": greeting} + + +def _build_dspy_program(): + return _GreetingDSPyProgram("Hello") + + +def _evaluate_dspy(program, trainset): + query = trainset[0]["input"] + expected = trainset[0]["label"] + score, _ = ExactMatchGuide().get_feedback(query, program(query), expected) + return score + + +def _sync_dspy_to_bundle(program, bundle): + _apply_greeting(program.greeting, bundle) + + +def _loss_factory(_bundle): + class _Loss: + def __call__(self, variable): + class _Value: + def __init__(self, target): + self.target = target + def backward(self): + return None + return _Value(variable) + return _Loss() + + +def _evaluate_text(candidate_text): + return _greeting_score(candidate_text)[0] + + +def _feedback_text(candidate_text): + return _greeting_score(candidate_text)[1] + + +def _initial_program(_bundle): + return "Hello" + + +def _evaluate_program(candidate_program: str): + score, feedback = _greeting_score(candidate_program) + return {"score": score, "feedback": feedback, "artifacts": {"candidate": candidate_program}} + + def build_trace_problem(**override_eval_kwargs): agent = GreetingAgent() guide = ExactMatchGuide() @@ -43,6 +128,31 @@ def build_trace_problem(**override_eval_kwargs): train_dataset=train_dataset, optimizer_kwargs=optimizer_kwargs, metadata=dict(benchmark="example", entry="GreetingAgent"), + frameworks={ + "dspy": { + "program_factory": _build_dspy_program, + "metric": lambda *args, **kwargs: None, + "to_trainset": lambda dataset: [{"input": dataset["inputs"][0], "label": dataset["infos"][0]}], + "evaluate": _evaluate_dspy, + "sync_to_bundle": _sync_dspy_to_bundle, + "state_serializer": _serialize_greeting_state, + }, + "textgrad": { + "initial_text": "Hello", + "role_description": "greeting prefix", + "loss_fn_factory": _loss_factory, + "evaluate": _evaluate_text, + "feedback": _feedback_text, + "apply_update": lambda candidate, b=None: _apply_greeting(candidate, b or {"param": agent}), + "state_serializer": _serialize_greeting_state, + }, + "openevolve": { + "initial_program_factory": _initial_program, + "evaluate_program": _evaluate_program, + "apply_candidate": lambda candidate, b: _apply_greeting(candidate, b), + "state_serializer": _serialize_greeting_state, + }, + }, ) diff --git a/trace_bench/integrations/external_optimizers.py b/trace_bench/integrations/external_optimizers.py new file mode 100644 index 0000000..1b3995e --- /dev/null +++ b/trace_bench/integrations/external_optimizers.py @@ -0,0 +1,477 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional +import importlib +import inspect +import os +import tempfile + + +def _package_available(package_name: str) -> bool: + try: + importlib.import_module(package_name) + return True + except Exception: + return False + + +_EXTERNAL_TRAINERS: Dict[str, Dict[str, str]] = { + "DSPy-MIPROv2": {"package": "dspy", "source": "external:dspy"}, + "DSPy-BootstrapFewShot": {"package": "dspy", "source": "external:dspy"}, + "TextGrad-TGD": {"package": "textgrad", "source": "external:textgrad"}, + "OpenEvolve": {"package": "openevolve", "source": "external:openevolve"}, +} + 
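+# Wiring in another external framework means adding an entry above, an
+# allowlist below, and a run_*_trainer adapter dispatched from
+# run_external_trainer at the bottom of this module. A minimal sketch, using a
+# hypothetical "foo" package (illustrative names only):
+#
+#   _EXTERNAL_TRAINERS["Foo-Opt"] = {"package": "foo", "source": "external:foo"}
+#   _EXTERNAL_ALLOWED_PARAMS["Foo-Opt"] = {"iterations", "seed", "verbose"}
+#
+# discover_external_trainers() then lists it, with availability decided by an
+# import probe, and the CLI validates params_variants against the allowlist.
+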
+_EXTERNAL_ALLOWED_PARAMS: Dict[str, set[str]] = { + "DSPy-MIPROv2": { + "threads", "num_threads", "num_trials", "teacher", "valset", "seed", "verbose", + "minibatch", "minibatch_size", "minibatch_full_eval_steps", "max_errors", "valset_ratio", + }, + "DSPy-BootstrapFewShot": { + "threads", "num_threads", "teacher", "seed", "verbose", "max_errors", + }, + "TextGrad-TGD": { + "num_steps", "steps", "verbose", "learning_rate", "lr", + }, + "OpenEvolve": { + "iterations", "num_iterations", "num_steps", "seed", "verbose", + "population_size", "num_islands", "include_artifacts", "max_artifact_bytes", + }, +} + + +def discover_external_trainers() -> List[Dict[str, Any]]: + return [ + { + "id": trainer_id, + "source": meta["source"], + "available": _package_available(meta["package"]), + } + for trainer_id, meta in _EXTERNAL_TRAINERS.items() + ] + + +def is_external_trainer(trainer_id: str) -> bool: + return trainer_id in _EXTERNAL_TRAINERS + + +def allowed_external_trainer_kwargs(trainer_id: str) -> Optional[set[str]]: + return _EXTERNAL_ALLOWED_PARAMS.get(trainer_id) + + +def _safe_score(value: Any) -> Optional[float]: + try: + return None if value is None else float(value) + except Exception: + return None + + +def _state_serializer_fallback(obj: Any) -> Dict[str, Any]: + if isinstance(obj, dict): + return obj + return {"kind": type(obj).__name__, "repr": repr(obj)} + + +def _extract_bundle_score(bundle: Dict[str, Any], candidate_value: Any) -> tuple[Any, Any]: + dataset = bundle["train_dataset"] + guide = bundle["guide"] + inputs = dataset.get("inputs") or [] + infos = dataset.get("infos") or [] + if not inputs or not infos: + return None, "empty_dataset" + task_input = inputs[0] + task_info = infos[0] + try: + score, feedback = guide(task_input, candidate_value, task_info) + return score, feedback + except Exception as exc: + return None, f"guide_error: {exc}" + + +def _score_from_eval_result(result: Any) -> tuple[Any, Any, Dict[str, Any]]: + feedback = None + artifacts: Dict[str, Any] = {} + score = None + if isinstance(result, tuple): + if len(result) >= 2: + score, feedback = result[0], result[1] + else: + score = result[0] + elif isinstance(result, dict): + if "score" in result: + score = result.get("score") + elif isinstance(result.get("metrics"), dict): + metrics = result["metrics"] + score = metrics.get("score") + if score is None and metrics: + score = next(iter(metrics.values())) + feedback = result.get("feedback") + artifacts = dict(result.get("artifacts") or {}) + else: + score = result + return score, feedback, artifacts + + +def _apply_text_candidate(bundle: Dict[str, Any], candidate_text: str) -> None: + param = bundle.get("param") + if callable(getattr(param, "_set", None)): + param._set(candidate_text) + return + if hasattr(param, "data"): + setattr(param, "data", candidate_text) + return + raise ValueError("Cannot apply text candidate back to bundle param") + + +def _select_best_state(initial_state: Dict[str, Any], final_state: Dict[str, Any], score_initial: Any, score_final: Any) -> Dict[str, Any]: + si = _safe_score(score_initial) + sf = _safe_score(score_final) + if sf is not None and (si is None or sf >= si): + return final_state + return initial_state or final_state + + +def _build_dspy_runtime(bundle: Dict[str, Any]) -> Dict[str, Any]: + spec = (bundle.get("frameworks") or {}).get("dspy") + if not isinstance(spec, dict): + raise ValueError("Task does not expose a DSPy adapter payload under bundle['frameworks']['dspy']") + required = ["program_factory", "metric", 
"evaluate"] + missing = [k for k in required if not callable(spec.get(k))] + if missing: + raise ValueError(f"DSPy adapter missing callables: {missing}") + return spec + + +def _build_textgrad_runtime(bundle: Dict[str, Any]) -> Dict[str, Any]: + spec = (bundle.get("frameworks") or {}).get("textgrad") + if isinstance(spec, dict): + return spec + param = bundle.get("param") + value = getattr(param, "data", None) + if isinstance(value, str): + objective = (bundle.get("optimizer_kwargs") or {}).get("objective", "Improve this text") + return { + "initial_text": value, + "role_description": "trainable text", + "evaluate": lambda candidate: _extract_bundle_score(bundle, candidate)[0], + "feedback": lambda candidate: _extract_bundle_score(bundle, candidate)[1], + "objective": objective, + "apply_update": lambda candidate_value, b=bundle: _apply_text_candidate(b, candidate_value), + } + raise ValueError("Task does not expose a TextGrad adapter and generic text fallback is not possible") + + +def _build_openevolve_runtime(bundle: Dict[str, Any]) -> Dict[str, Any]: + spec = (bundle.get("frameworks") or {}).get("openevolve") + if not isinstance(spec, dict): + raise ValueError("Task does not expose an OpenEvolve adapter payload under bundle['frameworks']['openevolve']") + if not (spec.get("initial_program") is not None or callable(spec.get("initial_program_factory"))): + raise ValueError("OpenEvolve adapter missing initial_program or initial_program_factory") + if not (callable(spec.get("evaluate_path")) or callable(spec.get("evaluate_program"))): + raise ValueError("OpenEvolve adapter missing evaluate_path or evaluate_program") + return spec + + +def _dspy_runner_class(trainer_id: str): + dspy = importlib.import_module("dspy") + mapping = { + "DSPy-MIPROv2": ["MIPROv2", "MIPRO"], + "DSPy-BootstrapFewShot": ["BootstrapFewShot"], + } + class_candidates = mapping.get(trainer_id) + if not class_candidates: + raise ValueError(f"Unknown DSPy trainer: {trainer_id}") + + namespaces = [dspy] + teleprompt = getattr(dspy, "teleprompt", None) + if teleprompt is not None: + namespaces.append(teleprompt) + try: + namespaces.append(importlib.import_module("dspy.teleprompt")) + except Exception: + pass + + for cls_name in class_candidates: + for ns in namespaces: + if hasattr(ns, cls_name): + return getattr(ns, cls_name) + + raise AttributeError(f"DSPy optimizer class not found: {class_candidates[0]}") + + +def run_dspy_trainer(bundle: Dict[str, Any], trainer_spec: Any, params: Dict[str, Any], mode: str) -> Dict[str, Any]: + spec = _build_dspy_runtime(bundle) + if mode == "stub" and callable(spec.get("stub_setup")): + spec["stub_setup"]() + elif callable(spec.get("setup")): + spec["setup"]() + + program = spec["program_factory"]() + metric = spec["metric"] + to_trainset = spec.get("to_trainset") + trainset = spec.get("trainset") + if callable(to_trainset): + trainset = to_trainset(bundle.get("train_dataset", {})) + elif callable(trainset): + trainset = trainset(bundle.get("train_dataset", {})) + if trainset is None: + trainset = bundle.get("train_dataset") + + optimizer_init = dict(spec.get("optimizer_init") or {}) + optimizer_init.update(dict(trainer_spec.optimizer_kwargs or {})) + compile_kwargs = dict(spec.get("compile_kwargs") or {}) + compile_kwargs.update(dict(params or {})) + # optimizer kwargs belong to optimizer ctor, not compile(). 
+ compile_kwargs.pop("optimizer_kwargs", None) + if "threads" in compile_kwargs and "num_threads" not in compile_kwargs: + compile_kwargs["num_threads"] = compile_kwargs.pop("threads") + + runner_cls = _dspy_runner_class(trainer_spec.id) + ctor_kwargs = {"metric": metric, **optimizer_init} + ctor_kwargs = _filter_supported_kwargs(runner_cls, ctor_kwargs) + optimizer = runner_cls(**ctor_kwargs) + + # Some DSPy versions expect these on compile() instead of constructor. + for k in ("max_bootstrapped_demos", "max_labeled_demos", "max_rounds", "num_trials", "auto", "eval_kwargs"): + if k in optimizer_init and k not in compile_kwargs: + compile_kwargs[k] = optimizer_init[k] + compile_kwargs.setdefault("eval_kwargs", {}) + # Keep benchmark runs non-interactive for MIPRO variants. + compile_kwargs.setdefault("requires_permission_to_run", False) + compile_kwargs.setdefault("view_data", False) + compile_kwargs.setdefault("view_examples", False) + + compile_kwargs = _filter_supported_kwargs(optimizer.compile, compile_kwargs) + evaluate = spec["evaluate"] + score_initial = evaluate(program, trainset) + compiled = optimizer.compile(student=program, trainset=trainset, **compile_kwargs) + score_final = evaluate(compiled, trainset) + + if callable(spec.get("sync_to_bundle")): + spec["sync_to_bundle"](compiled, bundle) + + serializer = spec.get("state_serializer") if callable(spec.get("state_serializer")) else _state_serializer_fallback + initial_state = serializer(program) + final_state = serializer(compiled) + best_state = _select_best_state(initial_state, final_state, score_initial, score_final) + feedback = f"compiled with {trainer_spec.id}" + return { + "status": "ok", + "score_initial": score_initial, + "score_final": score_final, + "score_best": max(filter(lambda x: x is not None, [_safe_score(score_initial), _safe_score(score_final)]), default=None), + "feedback": feedback, + "resolved_optimizer": f"DSPy.{trainer_spec.id}", + "resolved_trainer_kwargs": compile_kwargs, + "resolved_optimizer_kwargs": {k: v for k, v in ctor_kwargs.items() if k != "metric"}, + "resolved_guide": "dspy metric", + "resolved_logger": None, + "initial_state": initial_state, + "final_state": final_state, + "best_state": best_state, + } + + +def _resolve_textgrad_engine(tg: Any, optimizer_init: Dict[str, Any]) -> Any: + engine = ( + optimizer_init.get("engine") + or os.environ.get("TRACE_TEXTGRAD_ENGINE") + or os.environ.get("TRACE_LITELLM_MODEL") + or os.environ.get("OPENAI_MODEL") + ) + if not engine: + raise ValueError( + "TextGrad-TGD requires an engine. Set trainer.optimizer_kwargs.engine " + "or TRACE_TEXTGRAD_ENGINE / TRACE_LITELLM_MODEL." 
+ ) + try: + tg.set_backward_engine(engine, override=True) + except TypeError: + tg.set_backward_engine(engine) + return engine + + +def _filter_supported_kwargs(fn: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]: + try: + sig = inspect.signature(fn) + except Exception: + return dict(kwargs) + params = sig.parameters + if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()): + return dict(kwargs) + return {k: v for k, v in kwargs.items() if k in params} + + +def run_textgrad_trainer(bundle: Dict[str, Any], trainer_spec: Any, params: Dict[str, Any], mode: str) -> Dict[str, Any]: + del mode + tg = importlib.import_module("textgrad") + spec = _build_textgrad_runtime(bundle) + if callable(spec.get("setup")): + spec["setup"]() + + initial_text = spec.get("initial_text") + if initial_text is None and callable(spec.get("variable_factory")): + initial = spec["variable_factory"](bundle) + # variable_factory may directly return a Variable or text + if hasattr(initial, "value"): + variable = initial + initial_text = getattr(variable, "value", None) + else: + initial_text = initial + variable = None + else: + variable = None + + if variable is None: + variable = tg.Variable( + initial_text, + role_description=spec.get("role_description", "trainable text"), + requires_grad=True, + ) + + optimizer_init = dict(spec.get("optimizer_init") or {}) + optimizer_init.update(dict(trainer_spec.optimizer_kwargs or {})) + engine = _resolve_textgrad_engine(tg, optimizer_init) + optimizer_init.setdefault("engine", engine) + + loss_fn = spec.get("loss_fn") + if loss_fn is None: + if callable(spec.get("loss_fn_factory")): + loss_fn = spec["loss_fn_factory"](bundle) + else: + objective = spec.get("objective") or (bundle.get("optimizer_kwargs") or {}).get("objective", "Improve this text") + loss_fn = tg.TextLoss(objective, engine=engine) + + optimizer = tg.TGD(parameters=[variable], **optimizer_init) + num_steps = int(params.get("num_steps", params.get("steps", 1))) + + evaluate = spec.get("evaluate") or (lambda value: _extract_bundle_score(bundle, value)[0]) + feedback_fn = spec.get("feedback") or (lambda value: _extract_bundle_score(bundle, value)[1]) + score_initial = evaluate(getattr(variable, "value", initial_text)) + for _ in range(num_steps): + loss = loss_fn(variable) + loss.backward() + optimizer.step() + + candidate = getattr(variable, "value", initial_text) + if callable(spec.get("apply_update")): + spec["apply_update"](candidate) + else: + _apply_text_candidate(bundle, candidate) + + score_final = evaluate(candidate) + serializer = spec.get("state_serializer") if callable(spec.get("state_serializer")) else (lambda obj: {"kind": "TextGradVariable", "value": getattr(obj, "value", obj)}) + initial_state = serializer(initial_text) + final_state = serializer(candidate) + best_state = _select_best_state(initial_state, final_state, score_initial, score_final) + return { + "status": "ok", + "score_initial": score_initial, + "score_final": score_final, + "score_best": max(filter(lambda x: x is not None, [_safe_score(score_initial), _safe_score(score_final)]), default=None), + "feedback": feedback_fn(candidate), + "resolved_optimizer": "TextGrad.TGD", + "resolved_trainer_kwargs": dict(params or {}), + "resolved_optimizer_kwargs": optimizer_init, + "resolved_guide": "TextLoss / adapter evaluator", + "resolved_logger": None, + "initial_state": initial_state, + "final_state": final_state, + "best_state": best_state, + } + + +def run_openevolve_trainer(bundle: Dict[str, Any], trainer_spec: Any, params: 
Dict[str, Any], mode: str) -> Dict[str, Any]: + del mode + spec = _build_openevolve_runtime(bundle) + if callable(spec.get("setup")): + spec["setup"]() + openevolve = importlib.import_module("openevolve") + run_evolution = getattr(openevolve, "run_evolution") + + initial_program = spec.get("initial_program") + if initial_program is None and callable(spec.get("initial_program_factory")): + initial_program = spec["initial_program_factory"](bundle) + if not isinstance(initial_program, str): + raise ValueError("OpenEvolve initial program must be a string") + + def evaluator(candidate_path: str): + if callable(spec.get("evaluate_path")): + return spec["evaluate_path"](candidate_path) + code = Path(candidate_path).read_text(encoding="utf-8") + return spec["evaluate_program"](code) + + run_kwargs = dict(spec.get("run_kwargs") or {}) + run_kwargs.update(dict(trainer_spec.optimizer_kwargs or {})) + iterations = int(params.get("iterations", params.get("num_iterations", params.get("num_steps", 1)))) + other_kwargs = {k: v for k, v in (params or {}).items() if k not in {"iterations", "num_iterations", "num_steps"}} + safe_kwargs = _filter_supported_kwargs(run_evolution, {**run_kwargs, **other_kwargs}) + + # Baseline evaluation from the initial program text + if callable(spec.get("evaluate_program")): + score_initial, feedback_initial, _ = _score_from_eval_result(spec["evaluate_program"](initial_program)) + else: + with tempfile.TemporaryDirectory(prefix="tb_openevolve_init_") as td: + p = Path(td) / "candidate.py" + p.write_text(initial_program, encoding="utf-8") + score_initial, feedback_initial, _ = _score_from_eval_result(spec["evaluate_path"](str(p))) + + result = run_evolution(initial_program=initial_program, evaluator=evaluator, iterations=iterations, **safe_kwargs) + + best_code = None + if isinstance(result, dict): + best_code = result.get("best_code") or result.get("code") or result.get("best_program") + else: + for attr in ("best_code", "code", "best_program"): + if hasattr(result, attr): + best_code = getattr(result, attr) + break + if not isinstance(best_code, str): + raise ValueError("OpenEvolve result does not expose best_code") + + if callable(spec.get("apply_candidate")): + spec["apply_candidate"](best_code, bundle) + else: + _apply_text_candidate(bundle, best_code) + + if callable(spec.get("evaluate_program")): + score_final, feedback_final, artifacts = _score_from_eval_result(spec["evaluate_program"](best_code)) + else: + with tempfile.TemporaryDirectory(prefix="tb_openevolve_best_") as td: + p = Path(td) / "candidate.py" + p.write_text(best_code, encoding="utf-8") + score_final, feedback_final, artifacts = _score_from_eval_result(spec["evaluate_path"](str(p))) + + serializer = spec.get("state_serializer") if callable(spec.get("state_serializer")) else (lambda code: {"kind": "OpenEvolveProgram", "program": code}) + initial_state = serializer(initial_program) + final_state = serializer(best_code) + best_state = _select_best_state(initial_state, final_state, score_initial, score_final) + feedback = feedback_final or feedback_initial or "optimized with OpenEvolve" + if artifacts: + feedback = f"{feedback} | artifacts: {artifacts}" + return { + "status": "ok", + "score_initial": score_initial, + "score_final": score_final, + "score_best": max(filter(lambda x: x is not None, [_safe_score(score_initial), _safe_score(score_final)]), default=None), + "feedback": feedback, + "resolved_optimizer": "OpenEvolve.run_evolution", + "resolved_trainer_kwargs": {"iterations": iterations, 
**other_kwargs}, + "resolved_optimizer_kwargs": safe_kwargs, + "resolved_guide": "OpenEvolve evaluator", + "resolved_logger": None, + "initial_state": initial_state, + "final_state": final_state, + "best_state": best_state, + } + + +def run_external_trainer(bundle: Dict[str, Any], trainer_spec: Any, params: Dict[str, Any], mode: str) -> Dict[str, Any]: + if trainer_spec.id.startswith("DSPy-"): + return run_dspy_trainer(bundle, trainer_spec, params, mode) + if trainer_spec.id == "TextGrad-TGD": + return run_textgrad_trainer(bundle, trainer_spec, params, mode) + if trainer_spec.id == "OpenEvolve": + return run_openevolve_trainer(bundle, trainer_spec, params, mode) + raise ValueError(f"Unknown external trainer: {trainer_spec.id}") diff --git a/trace_bench/registry.py b/trace_bench/registry.py index 6f55840..7d55e25 100644 --- a/trace_bench/registry.py +++ b/trace_bench/registry.py @@ -13,6 +13,7 @@ from trace_bench.veribench_adapter import build_bundle as build_veribench_bundle from trace_bench.veribench_adapter import discover_task_names as discover_veribench_task_names +from trace_bench.integrations.external_optimizers import discover_external_trainers @dataclass class TaskSpec: @@ -304,6 +305,8 @@ def discover_trainers() -> List[TrainerSpec]: if trainer_id in _PRIORITY_SEARCH_EXAMPLE_TRAINERS and not priority_examples_supported: available = False specs[trainer_id] = TrainerSpec(id=trainer_id, source=obj.__module__, available=available) + for ext in discover_external_trainers(): + specs[ext['id']] = TrainerSpec(id=ext['id'], source=ext['source'], available=ext['available']) return sorted(specs.values(), key=lambda spec: spec.id) diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py index e285341..be56cbe 100644 --- a/trace_bench/resolve.py +++ b/trace_bench/resolve.py @@ -35,6 +35,12 @@ def _param_alias_map(algo_name: str) -> Dict[str, str]: def resolve_trainer_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]: + try: + from trace_bench.integrations.external_optimizers import is_external_trainer + if is_external_trainer(algo_name): + return {k: v for k, v in (params or {}).items() if k not in _FILTERED_KWARGS} + except Exception: + pass kwargs = _default_trainer_kwargs(algo_name) alias_map = _param_alias_map(algo_name) for key, value in (params or {}).items(): diff --git a/trace_bench/results.py b/trace_bench/results.py index b6a0fe1..a4b9142 100644 --- a/trace_bench/results.py +++ b/trace_bench/results.py @@ -170,6 +170,37 @@ def _score(v: Any) -> float: return out + +def build_trainer_comparison_rows(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + def _score(v: Any) -> float: + try: + return float(v) + except Exception: + return float("-inf") + + ok_rows = [r for r in (rows or []) if str(r.get("status")) == "ok"] + ok_rows.sort( + key=lambda r: ( + str(r.get("task_id") or ""), + -_score(r.get("score_best")), + str(r.get("trainer_id") or ""), + ) + ) + + out: List[Dict[str, Any]] = [] + for rank, row in enumerate(ok_rows, start=1): + out.append({ + "rank": rank, + "task_id": row.get("task_id"), + "suite": row.get("suite"), + "job_id": row.get("job_id"), + "trainer_id": row.get("trainer_id"), + "score_best": row.get("score_best"), + "time_seconds": row.get("time_seconds"), + "status": row.get("status"), + }) + return out + def summarize_results(rows: List[Dict[str, Any]]) -> Dict[str, Any]: counts: Dict[str, int] = {"ok": 0, "failed": 0, "skipped": 0} token_totals: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0} @@ -191,5 +222,6 
@@ def summarize_results(rows: List[Dict[str, Any]]) -> Dict[str, Any]: "build_results_row", "build_results_csv_row", "build_leaderboard_rows", + "build_trainer_comparison_rows", "summarize_results", ] diff --git a/trace_bench/runner.py b/trace_bench/runner.py index 6b3183e..61ebf1f 100644 --- a/trace_bench/runner.py +++ b/trace_bench/runner.py @@ -45,9 +45,11 @@ priority_search_example_trainers_supported, ) from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs +from trace_bench.integrations.external_optimizers import is_external_trainer, run_external_trainer from trace_bench.results import ( RESULT_COLUMNS, build_leaderboard_rows, + build_trainer_comparison_rows, build_results_csv_row, build_results_row, summarize_results, @@ -303,6 +305,9 @@ def _train_bundle( mode: str, ) -> Dict[str, Any]: """Train a bundle synchronously. Timeout is handled at the job level.""" + if is_external_trainer(trainer_spec.id): + return run_external_trainer(bundle, trainer_spec, params, mode) + from opto import trainer as opto_trainer algo_name = trainer_spec.id @@ -564,6 +569,21 @@ def _resolve_runtime_from_bundle( resolved_guide_kwargs = merge_kwargs(bundle.get("guide_kwargs"), trainer_spec.guide_kwargs or {}) resolved_logger_kwargs = merge_kwargs(bundle.get("logger_kwargs"), trainer_spec.logger_kwargs or {}) + if is_external_trainer(trainer_spec.id): + raw_logger_obj = trainer_spec.logger or "ConsoleLogger" + logger_obj, resolved_logger = _build_logger(raw_logger_obj, resolved_logger_kwargs) + return { + "resolved_optimizer": trainer_spec.id, + "resolved_guide": _component_identity(trainer_spec.guide or bundle.get("guide")), + "resolved_logger": resolved_logger, + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": dict(trainer_spec.optimizer_kwargs or {}), + "resolved_guide_kwargs": resolved_guide_kwargs, + "resolved_logger_kwargs": resolved_logger_kwargs, + "guide_obj": trainer_spec.guide or bundle.get("guide"), + "logger_obj": logger_obj, + } + guide_obj = trainer_spec.guide or bundle["guide"] raw_logger_obj = trainer_spec.logger or "ConsoleLogger" logger_obj, resolved_logger = _build_logger(raw_logger_obj, resolved_logger_kwargs) @@ -712,6 +732,12 @@ def _write_atomic(path: str, payload_value: Dict[str, Any]) -> None: train_result = _train_bundle(bundle, trainer_spec, params, mode) payload["status"] = train_result.get("status", "ok") + payload["score_initial"] = train_result.get("score_initial", payload["score_initial"]) + payload["score_final"] = train_result.get("score_final", payload["score_final"]) + payload["score_best"] = train_result.get("score_best", payload["score_best"]) + payload["initial_state"] = train_result.get("initial_state", payload["initial_state"]) + payload["final_state"] = train_result.get("final_state", payload.get("final_state", {})) + payload["best_state"] = train_result.get("best_state", payload.get("best_state", {})) payload["resolved_optimizer"] = train_result.get("resolved_optimizer", payload["resolved_optimizer"]) payload["resolved_guide"] = train_result.get("resolved_guide", payload["resolved_guide"]) payload["resolved_logger"] = train_result.get("resolved_logger", payload["resolved_logger"]) @@ -736,21 +762,25 @@ def _write_atomic(path: str, payload_value: Dict[str, Any]) -> None: suffix = f"\n{trace}" if trace else "" payload["feedback"] = f"training_error: {train_result.get('error', 'unknown')}{suffix}" - final = _evaluate_bundle(bundle) - payload["score_final"] = final.get("score") - if payload["status"] != "failed": - 
payload["feedback"] = final.get("feedback") or payload["feedback"] + if payload["score_final"] is None: + final = _evaluate_bundle(bundle) + payload["score_final"] = final.get("score") + if payload["status"] != "failed": + payload["feedback"] = final.get("feedback") or payload["feedback"] si, sf = payload["score_initial"], payload["score_final"] - if isinstance(si, (int, float)) and isinstance(sf, (int, float)): - payload["score_best"] = max(si, sf) - else: - payload["score_best"] = sf if sf is not None else si - - payload["final_state"] = _snapshot_model_state(bundle["param"]) - payload["best_state"] = _select_best_state( - payload["initial_state"], payload["final_state"], payload["score_initial"], payload["score_final"] - ) + if payload["score_best"] is None: + if isinstance(si, (int, float)) and isinstance(sf, (int, float)): + payload["score_best"] = max(si, sf) + else: + payload["score_best"] = sf if sf is not None else si + + if not payload.get("final_state"): + payload["final_state"] = _snapshot_model_state(bundle["param"]) + if not payload.get("best_state"): + payload["best_state"] = _select_best_state( + payload["initial_state"], payload["final_state"], payload["score_initial"], payload["score_final"] + ) payload.update(_extract_token_usage(payload["resolved_optimizer_kwargs"])) except NotImplementedError as exc: @@ -801,6 +831,7 @@ def _build_files_index(run_artifacts: RunArtifacts, manifest_jobs: List[Dict[str "results_csv": str(run_artifacts.results_csv), "summary_json": str(run_artifacts.summary_json), "leaderboard_csv": str(run_artifacts.leaderboard_csv), + "trainer_comparison_csv": str(run_artifacts.trainer_comparison_csv), }, "notes": { "token_scope": _token_scope_note(), @@ -1118,6 +1149,16 @@ def _execute_job(j: JobSpec) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]: writer.writeheader() writer.writerows(leaderboard_rows) + comparison_rows = build_trainer_comparison_rows(results) + if comparison_rows: + with self.artifacts.trainer_comparison_csv.open("w", encoding="utf-8", newline="") as f: + writer = csv.DictWriter( + f, + fieldnames=["rank", "task_id", "suite", "job_id", "trainer_id", "score_best", "time_seconds", "status"], + ) + writer.writeheader() + writer.writerows(comparison_rows) + write_files_index(self.artifacts.files_index_json, _build_files_index(self.artifacts, manifest_jobs)) log_run_end(mlflow_ctx, summary_payload) @@ -1341,6 +1382,12 @@ def _run_job_inprocess(self, job: JobSpec) -> Dict[str, Any]: bundle, job.trainer, job.params, self.config.mode, ) status = train_result.get("status", "ok") + score_initial = train_result.get("score_initial", score_initial) + score_final = train_result.get("score_final", score_final) + score_best = train_result.get("score_best", score_best) + initial_state = train_result.get("initial_state", initial_state) + final_state = train_result.get("final_state", final_state) + best_state = train_result.get("best_state", best_state) resolved_optimizer = train_result.get("resolved_optimizer", resolved_optimizer) resolved_guide = train_result.get("resolved_guide", resolved_guide) resolved_logger = train_result.get("resolved_logger", resolved_logger) @@ -1360,17 +1407,21 @@ def _run_job_inprocess(self, job: JobSpec) -> Dict[str, Any]: trace = train_result.get("traceback") suffix = f"\n{trace}" if trace else "" feedback = f"training_error: {train_result.get('error', 'unknown')}{suffix}" - final = _evaluate_bundle(bundle) - score_final = final.get("score") - if status != "failed": - feedback = final.get("feedback") or feedback - - if 
isinstance(score_initial, (int, float)) and isinstance(score_final, (int, float)): - score_best = max(score_initial, score_final) - else: - score_best = score_final if score_final is not None else score_initial - final_state = _snapshot_model_state(bundle["param"]) - best_state = _select_best_state(initial_state, final_state, score_initial, score_final) + if score_final is None: + final = _evaluate_bundle(bundle) + score_final = final.get("score") + if status != "failed": + feedback = final.get("feedback") or feedback + + if score_best is None: + if isinstance(score_initial, (int, float)) and isinstance(score_final, (int, float)): + score_best = max(score_initial, score_final) + else: + score_best = score_final if score_final is not None else score_initial + if not final_state: + final_state = _snapshot_model_state(bundle["param"]) + if not best_state: + best_state = _select_best_state(initial_state, final_state, score_initial, score_final) usage = _extract_token_usage(resolved_optimizer_kwargs) prompt_tokens = usage["prompt_tokens"] completion_tokens = usage["completion_tokens"]