diff --git a/notebooks/03_ui_launch_monitor.ipynb b/notebooks/03_ui_launch_monitor.ipynb index 6fc2d71..bdde54c 100644 --- a/notebooks/03_ui_launch_monitor.ipynb +++ b/notebooks/03_ui_launch_monitor.ipynb @@ -1,428 +1,428 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "ZZiTf7GH2QMd" - }, - "source": [ - "# Trace-Bench M3 — UI Launch + Browse + Resume + TensorBoard/MLflow\n", - "\n", - "This notebook demonstrates M3 UI behavior:\n", - "- Launch a small stub run (2×2 matrix) to produce run artifacts\n", - "- Start the Gradio UI with `trace-bench ui`\n", - "- Browse runs, filter results, inspect jobs\n", - "- Resume a previous run using the UI resume flow\n", - "- TensorBoard and MLflow integration (optional)\n", - "\n", - "**Artifacts under `runs//` are canonical.** The UI reads filesystem; MLflow/TB mirror it.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m3/deliverable/notebooks/03_ui_launch_monitor.ipynb)" - ], - "id": "ZZiTf7GH2QMd" - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "W3AArnYD2QMf", - "outputId": "6bb2af2d-fdbe-4b3d-e545-a02a5cf0eec1" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive\n", - "Runs dir: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3\n" - ] - } - ], - "source": [ - "from datetime import date\n", - "from pathlib import Path\n", - "import os\n", - "\n", - "try:\n", - " from google.colab import drive\n", - " drive.mount(\"/content/drive\")\n", - "except Exception:\n", - " pass\n", - "\n", - "def bench_dir(project=\"bench\", sub=\"trace_bench_m3\", local=\"/content/bench\"):\n", - " drive_root = Path(\"/content/drive/MyDrive\")\n", - " root = drive_root if drive_root.is_dir() else Path(local)\n", - " out = root / project / 
date.today().isoformat() / sub\n", - " out.mkdir(parents=True, exist_ok=True)\n", - " return str(out)\n", - "\n", - "RUNS_DIR = bench_dir()\n", - "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", - "print(\"Runs dir:\", RUNS_DIR)\n" - ], - "id": "W3AArnYD2QMf" - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "V3IInAJw2QMg", - "outputId": "1ffabb08-6575-41ef-b172-3480ee67a4d9" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "/content/Trace-Bench\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.7/40.7 kB\u001b[0m \u001b[31m286.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m69.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m65.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25himports ok\n" - ] - } - ], - "source": [ - "# Clone repos + install deps\n", - "!git clone --depth 1 --branch m3/deliverable https://github.com/guru-code-expert/Trace-Bench.git /content/Trace-Bench 2>/dev/null || (cd /content/Trace-Bench && git fetch origin m3/deliverable && git checkout m3/deliverable && git pull --ff-only)\n", - "!git clone --depth 1 --branch experimental https://github.com/AgentOpt/OpenTrace.git /content/OpenTrace 2>/dev/null || (cd /content/OpenTrace && git fetch origin experimental && git checkout experimental && git pull --ff-only)\n", - "\n", - "%cd /content/Trace-Bench\n", - "!python -m pip install -q pyyaml gradio \"litellm==1.75.0\" \"aiohttp>=3.9,<3.13\"\n", - "\n", - "# OpenTrace currently declares python>=3.13 in pyproject; on Colab (3.12) use source checkout via PYTHONPATH.\n", - "!PYTHONPATH=/content/OpenTrace:/content/Trace-Bench:$PYTHONPATH python -c \"import opto, 
trace_bench, litellm; print('imports ok')\"\n" - ], - "id": "V3IInAJw2QMg" - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Y02hciN42QMi", - "outputId": "9c878b90-3fe1-45aa-adf9-aa6cee66e66a" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Run complete: 20260305-071546-cc2fe733\n", - "Run dir: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733\n", - "Manifest: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/meta/manifest.json\n", - "Results: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/results.csv\n", - "Summary: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/summary.json\n", - "Leaderboard: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/leaderboard.csv\n", - "Files index: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/meta/files_index.json\n", - "\n", - "--- Run artifacts ---\n", - "/content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-063935-71c660a2/:\n", - "total 13\n", - "drwx------ 2 root root 4096 Mar 5 06:39 jobs\n", - "-rw------- 1 root root 202 Mar 5 06:39 leaderboard.csv\n", - "drwx------ 2 root root 4096 Mar 5 06:39 meta\n", - "-rw------- 1 root root 3242 Mar 5 06:39 results.csv\n", - "-rw------- 1 root root 212 Mar 5 06:39 summary.json\n", - "\n", - "/content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-064200-ba7c04cb/:\n", - "total 12\n", - "drwx------ 2 root root 4096 Mar 5 06:42 jobs\n", - "-rw------- 1 root root 151 Mar 5 07:01 leaderboard.csv\n", - "drwx------ 2 root root 4096 Mar 5 06:42 meta\n", - "-rw------- 1 root root 2809 Mar 5 07:01 results.csv\n", - "-rw------- 1 root root 234 Mar 5 07:01 summary.json\n", - "\n", - "/content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071314-cc2fe733/:\n", - 
"total 13\n", - "drwx------ 2 root root 4096 Mar 5 07:13 jobs\n", - "-rw------- 1 root root 202 Mar 5 07:13 leaderboard.csv\n", - "drwx------ 2 root root 4096 Mar 5 07:13 meta\n", - "-rw------- 1 root root 3242 Mar 5 07:13 results.csv\n", - "-rw------- 1 root root 212 Mar 5 07:13 summary.json\n", - "\n", - "/content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/:\n", - "total 13\n", - "drwx------ 6 root root 4096 Mar 5 07:15 jobs\n", - "-rw------- 1 root root 200 Mar 5 07:15 leaderboard.csv\n", - "drwx------ 2 root root 4096 Mar 5 07:15 meta\n", - "-rw------- 1 root root 3239 Mar 5 07:15 results.csv\n", - "-rw------- 1 root root 212 Mar 5 07:15 summary.json\n", - "\n", - "--- results.csv (head) ---\n", - "==> /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-063935-71c660a2/results.csv <==\n", - "run_id,job_id,task_id,suite,trainer_id,seed,status,score_initial,score_final,score_best,time_seconds,resolved_optimizer,resolved_guide,resolved_logger,resolved_trainer_kwargs,resolved_optimizer_kwargs,resolved_guide_kwargs,resolved_logger_kwargs,eval_kwargs,feedback,llm_provider,llm_model,llm_base_url,token_scope,prompt_tokens,completion_tokens,total_tokens,initial_state_path,best_state_path,final_state_path,state_history_path,tb_logdir\r\n", - "20260305-063935-71c660a2,a757c4090147,internal:numeric_param,internal,GEPA-Base,123,ok,-3.0,-3.0,-3.0,5.807537,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target 
value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/a757c4090147/artifacts/initial_state.yaml,jobs/a757c4090147/artifacts/best_state.yaml,jobs/a757c4090147/artifacts/final_state.yaml,jobs/a757c4090147/artifacts/state_history.jsonl,jobs/a757c4090147/tb\r\n", - "20260305-063935-71c660a2,806f391306ae,internal:numeric_param,internal,PrioritySearch,123,ok,-3.0,-3.0,-3.0,5.817129,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/806f391306ae/artifacts/initial_state.yaml,jobs/806f391306ae/artifacts/best_state.yaml,jobs/806f391306ae/artifacts/final_state.yaml,jobs/806f391306ae/artifacts/state_history.jsonl,jobs/806f391306ae/tb\r\n", - "20260305-063935-71c660a2,51046d033ec9,internal:code_param,internal,GEPA-Base,123,ok,1.0,1.0,1.0,0.218794,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/51046d033ec9/artifacts/initial_state.yaml,jobs/51046d033ec9/artifacts/best_state.yaml,jobs/51046d033ec9/artifacts/final_state.yaml,jobs/51046d033ec9/artifacts/state_history.jsonl,jobs/51046d033ec9/tb\r\n", - 
"20260305-063935-71c660a2,3dc5df989755,internal:code_param,internal,PrioritySearch,123,ok,1.0,1.0,1.0,0.305007,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/3dc5df989755/artifacts/initial_state.yaml,jobs/3dc5df989755/artifacts/best_state.yaml,jobs/3dc5df989755/artifacts/final_state.yaml,jobs/3dc5df989755/artifacts/state_history.jsonl,jobs/3dc5df989755/tb\r\n", - "\n", - "==> /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-064200-ba7c04cb/results.csv <==\n", - "run_id,job_id,task_id,suite,trainer_id,seed,status,score_initial,score_final,score_best,time_seconds,resolved_optimizer,resolved_guide,resolved_logger,resolved_trainer_kwargs,resolved_optimizer_kwargs,resolved_guide_kwargs,resolved_logger_kwargs,eval_kwargs,feedback,llm_provider,llm_model,llm_base_url,token_scope,prompt_tokens,completion_tokens,total_tokens,initial_state_path,best_state_path,final_state_path,state_history_path,tb_logdir\r\n", - "20260305-064200-ba7c04cb,ea4ad49014e2,llm4ad:optimization/bp_2d_construct,llm4ad,GEPA-Beam,123,ok,-29.0,-29.0,-29.0,498.058084,OPROv2,LLM4AD.llm4ad_loader.AutonomousEvaluatorGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_search_iterations\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2, \"\"verbose\"\": false}\",\"{\"\"memory_size\"\": 10, \"\"objective\"\": \"\"You are optimizing the implementation of `determine_next_assignment` for the LLM4AD task.\\\\n\\\\nTask description:\\\\n'\\\\n\\\\nYour goal is to return a correct and efficient function whose score (computed by the task evaluator) is as high as 
possible.\"\"}\",{},{},{},\"TRACE_FEEDBACK_JSON={\"\"status\"\": \"\"ok\"\", \"\"phase\"\": \"\"evaluate\"\", \"\"score\"\": -29.0}\n", - "Autonomous eval OK in 57.94s; score=-29.0\n", - "TRACE_FEEDBACK_JSON={\"\"status\"\": \"\"ok\"\", \"\"phase\"\": \"\"evaluate\"\", \"\"score\"\": -29.0}\n", - "Autonomous eval OK in 57.94s; score=-29.0\",openrouter,openrouter/x-ai/grok-4.1-fast,https://openrouter.ai/api/v1,trace_optimization_only,0,0,0,jobs/ea4ad49014e2/artifacts/initial_state.yaml,jobs/ea4ad49014e2/artifacts/best_state.yaml,jobs/ea4ad49014e2/artifacts/final_state.yaml,jobs/ea4ad49014e2/artifacts/state_history.jsonl,jobs/ea4ad49014e2/tb\r\n", - "\n", - "==> /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071314-cc2fe733/results.csv <==\n", - "run_id,job_id,task_id,suite,trainer_id,seed,status,score_initial,score_final,score_best,time_seconds,resolved_optimizer,resolved_guide,resolved_logger,resolved_trainer_kwargs,resolved_optimizer_kwargs,resolved_guide_kwargs,resolved_logger_kwargs,eval_kwargs,feedback,llm_provider,llm_model,llm_base_url,token_scope,prompt_tokens,completion_tokens,total_tokens,initial_state_path,best_state_path,final_state_path,state_history_path,tb_logdir\r\n", - "20260305-071314-cc2fe733,a757c4090147,internal:numeric_param,internal,GEPA-Base,123,ok,-3.0,-3.0,-3.0,4.712792,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/a757c4090147/artifacts/initial_state.yaml,jobs/a757c4090147/artifacts/best_state.yaml,jobs/a757c4090147/artifacts/final_state.yaml,jobs/a757c4090147/artifacts/state_history.jsonl,jobs/a757c4090147/tb\r\n", - 
"20260305-071314-cc2fe733,806f391306ae,internal:numeric_param,internal,PrioritySearch,123,ok,-3.0,-3.0,-3.0,4.713517,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/806f391306ae/artifacts/initial_state.yaml,jobs/806f391306ae/artifacts/best_state.yaml,jobs/806f391306ae/artifacts/final_state.yaml,jobs/806f391306ae/artifacts/state_history.jsonl,jobs/806f391306ae/tb\r\n", - "20260305-071314-cc2fe733,51046d033ec9,internal:code_param,internal,GEPA-Base,123,ok,1.0,1.0,1.0,0.090701,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/51046d033ec9/artifacts/initial_state.yaml,jobs/51046d033ec9/artifacts/best_state.yaml,jobs/51046d033ec9/artifacts/final_state.yaml,jobs/51046d033ec9/artifacts/state_history.jsonl,jobs/51046d033ec9/tb\r\n", - "20260305-071314-cc2fe733,3dc5df989755,internal:code_param,internal,PrioritySearch,123,ok,1.0,1.0,1.0,0.160663,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the 
target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/3dc5df989755/artifacts/initial_state.yaml,jobs/3dc5df989755/artifacts/best_state.yaml,jobs/3dc5df989755/artifacts/final_state.yaml,jobs/3dc5df989755/artifacts/state_history.jsonl,jobs/3dc5df989755/tb\r\n", - "\n", - "==> /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/results.csv <==\n", - "run_id,job_id,task_id,suite,trainer_id,seed,status,score_initial,score_final,score_best,time_seconds,resolved_optimizer,resolved_guide,resolved_logger,resolved_trainer_kwargs,resolved_optimizer_kwargs,resolved_guide_kwargs,resolved_logger_kwargs,eval_kwargs,feedback,llm_provider,llm_model,llm_base_url,token_scope,prompt_tokens,completion_tokens,total_tokens,initial_state_path,best_state_path,final_state_path,state_history_path,tb_logdir\r\n", - "20260305-071546-cc2fe733,a757c4090147,internal:numeric_param,internal,GEPA-Base,123,ok,-3.0,-3.0,-3.0,6.5457,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/a757c4090147/artifacts/initial_state.yaml,jobs/a757c4090147/artifacts/best_state.yaml,jobs/a757c4090147/artifacts/final_state.yaml,jobs/a757c4090147/artifacts/state_history.jsonl,jobs/a757c4090147/tb\r\n", - "20260305-071546-cc2fe733,806f391306ae,internal:numeric_param,internal,PrioritySearch,123,ok,-3.0,-3.0,-3.0,6.62411,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", 
\"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/806f391306ae/artifacts/initial_state.yaml,jobs/806f391306ae/artifacts/best_state.yaml,jobs/806f391306ae/artifacts/final_state.yaml,jobs/806f391306ae/artifacts/state_history.jsonl,jobs/806f391306ae/tb\r\n", - "20260305-071546-cc2fe733,51046d033ec9,internal:code_param,internal,GEPA-Base,123,ok,1.0,1.0,1.0,0.156546,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/51046d033ec9/artifacts/initial_state.yaml,jobs/51046d033ec9/artifacts/best_state.yaml,jobs/51046d033ec9/artifacts/final_state.yaml,jobs/51046d033ec9/artifacts/state_history.jsonl,jobs/51046d033ec9/tb\r\n", - "20260305-071546-cc2fe733,3dc5df989755,internal:code_param,internal,PrioritySearch,123,ok,1.0,1.0,1.0,0.273471,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/3dc5df989755/artifacts/initial_state.yaml,jobs/3dc5df989755/artifacts/best_state.yaml,jobs/3dc5df989755/artifacts/final_state.yaml,jobs/3dc5df989755/artifacts/state_history.jsonl,jobs/3dc5df989755/tb\r\n" - ] - } - ], - "source": [ - "%%bash\n", - "set -euo pipefail\n", - "cd 
/content/Trace-Bench\n", - "\n", - "# Use the bundled M3 demo config\n", - "PYTHONPATH=/content/OpenTrace:/content/Trace-Bench:$PYTHONPATH python -m trace_bench run --config configs/m3_ui_demo.yaml --runs-dir \"$RUNS_DIR\"\n", - "\n", - "echo \"\"\n", - "echo \"--- Run artifacts ---\"\n", - "ls -la \"$RUNS_DIR\"/*/\n", - "echo \"\"\n", - "echo \"--- results.csv (head) ---\"\n", - "head -5 \"$RUNS_DIR\"/*/results.csv 2>/dev/null || echo \"(no results.csv found)\"\n" - ], - "id": "Y02hciN42QMi" - }, - { - "cell_type": "code", - "id": "6pqajjt3r8g", - "source": [ - "# Show run summary\n", - "import json, glob\n", - "\n", - "for summary_path in sorted(glob.glob(f\"{RUNS_DIR}/*/summary.json\")):\n", - " with open(summary_path) as f:\n", - " summary = json.load(f)\n", - " run_id = summary_path.split(\"/\")[-2]\n", - " print(f\"Run: {run_id}\")\n", - " print(f\" Total jobs: {summary.get('total_jobs', '?')}\")\n", - " print(f\" Counts: {summary.get('counts', {})}\")\n", - " print()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6pqajjt3r8g", - "outputId": "a0b40aef-122f-401c-cc9c-51a65c195869" - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Run: 20260305-063935-71c660a2\n", - " Total jobs: 4\n", - " Counts: {'ok': 4, 'failed': 0, 'skipped': 0}\n", - "\n", - "Run: 20260305-064200-ba7c04cb\n", - " Total jobs: 2\n", - " Counts: {'ok': 2, 'failed': 0, 'skipped': 0}\n", - "\n", - "Run: 20260305-071314-cc2fe733\n", - " Total jobs: 4\n", - " Counts: {'ok': 4, 'failed': 0, 'skipped': 0}\n", - "\n", - "Run: 20260305-071546-cc2fe733\n", - " Total jobs: 4\n", - " Counts: {'ok': 4, 'failed': 0, 'skipped': 0}\n", - "\n" - ] - } - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ZZiTf7GH2QMd" + }, + "source": [ + "# Trace-Bench M3 — UI Launch + Browse + Resume + TensorBoard/MLflow\n", + "\n", + "This notebook demonstrates M3 UI behavior:\n", + "- Launch a small 
stub run (2×2 matrix) to produce run artifacts\n", + "- Start the Gradio UI with `trace-bench ui`\n", + "- Browse runs, filter results, inspect jobs\n", + "- Resume a previous run using the UI resume flow\n", + "- TensorBoard and MLflow integration (optional)\n", + "\n", + "**Artifacts under `runs//` are canonical.** The UI reads filesystem; MLflow/TB mirror it.\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m3/deliverable/notebooks/03_ui_launch_monitor.ipynb)" + ], + "id": "ZZiTf7GH2QMd" + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "W3AArnYD2QMf", + "outputId": "6bb2af2d-fdbe-4b3d-e545-a02a5cf0eec1" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "TqnwOwlN2QMj", - "outputId": "0baf2710-fade-4175-8cf7-678f446f9f6a" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[info] Secret not found or inaccessible: OPENAI_API_KEY\n", - "OPENROUTER_API_KEY loaded: True\n", - "OPENAI_API_KEY loaded: False\n" - ] - } - ], - "source": [ - "# Load API keys from Colab secrets into env (safe: no hard failure if missing)\n", - "import os\n", - "\n", - "try:\n", - " from google.colab import userdata\n", - "except Exception:\n", - " userdata = None\n", - "\n", - "\n", - "def _safe_secret(name: str) -> str:\n", - " if userdata is None:\n", - " return \"\"\n", - " try:\n", - " return userdata.get(name) or \"\"\n", - " except Exception:\n", - " print(f\"[info] Secret not found or inaccessible: {name}\")\n", - " return \"\"\n", - "\n", - "# Keep existing env value first; fallback to secret if present\n", - "os.environ[\"OPENROUTER_API_KEY\"] = os.environ.get(\"OPENROUTER_API_KEY\") or _safe_secret(\"OPENROUTER_API_KEY\")\n", - 
"os.environ[\"OPENAI_API_KEY\"] = os.environ.get(\"OPENAI_API_KEY\") or _safe_secret(\"OPENAI_API_KEY\")\n", - "\n", - "print(\"OPENROUTER_API_KEY loaded:\", bool(os.environ.get(\"OPENROUTER_API_KEY\")))\n", - "print(\"OPENAI_API_KEY loaded:\", bool(os.environ.get(\"OPENAI_API_KEY\")))\n", - "\n", - "if not os.environ.get(\"OPENROUTER_API_KEY\") and not os.environ.get(\"OPENAI_API_KEY\"):\n", - " print(\"[warn] No API key found. Real-mode runs may fail; stub mode still works.\")\n" - ], - "id": "TqnwOwlN2QMj" + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "Runs dir: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3\n" + ] + } + ], + "source": [ + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench_m3\", local=\"/content/bench\"):\n", + " drive_root = Path(\"/content/drive/MyDrive\")\n", + " root = drive_root if drive_root.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs dir:\", RUNS_DIR)\n" + ], + "id": "W3AArnYD2QMf" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "V3IInAJw2QMg", + "outputId": "1ffabb08-6575-41ef-b172-3480ee67a4d9" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "ze8ljpzqqzb", - "source": [ - "## Launch Gradio UI\n", - "\n", - "The UI has 3 tabs:\n", - "1. **Launch Run** ? discover tasks/trainers dynamically, edit configs in YAML editor, choose provider (`custom/openai/openrouter`), and run with overrides including logger override (`default/none/`).\n", - "2. 
**Browse Runs** ? select a run, view results/config/summary, filter by suite/status/trainer, resume a run.\n", - "3. **Job Inspector** ? drill into individual jobs, view meta/events/state artifacts/TensorBoard dir.\n", - "\n", - "The `--share` flag generates a public URL (auto-detected on Colab).\n", - "\n", - "**Note:** This cell blocks while the UI is running. Open the printed URL to interact.\n" - ], - "metadata": { - "id": "ze8ljpzqqzb" - } + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/Trace-Bench\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.7/40.7 kB\u001b[0m \u001b[31m286.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m69.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m65.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25himports ok\n" + ] + } + ], + "source": [ + "# Clone repos + install deps\n", + "!git clone --depth 1 --branch m3/deliverable https://github.com/guru-code-expert/Trace-Bench.git /content/Trace-Bench 2>/dev/null || (cd /content/Trace-Bench && git fetch origin m3/deliverable && git checkout m3/deliverable && git pull --ff-only)\n", + "!git clone --depth 1 --branch experimental https://github.com/AgentOpt/OpenTrace.git /content/OpenTrace 2>/dev/null || (cd /content/OpenTrace && git fetch origin experimental && git checkout experimental && git pull --ff-only)\n", + "\n", + "%cd /content/Trace-Bench\n", + "!python -m pip install -q pyyaml gradio \"litellm==1.75.0\" \"aiohttp>=3.9,<3.13\"\n", + "\n", + "# OpenTrace currently declares python>=3.13 in pyproject; on Colab (3.12) use source checkout via PYTHONPATH.\n", + "!PYTHONPATH=/content/OpenTrace:/content/Trace-Bench:$PYTHONPATH python -c \"import opto, trace_bench, litellm; 
print('imports ok')\"\n" + ], + "id": "V3IInAJw2QMg" + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "Y02hciN42QMi", + "outputId": "9c878b90-3fe1-45aa-adf9-aa6cee66e66a" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wDCPwjtl2QMk", - "outputId": "daca6327-1745-46cf-ba7d-3972d114d0af" - }, - "source": [ - "!PYTHONPATH=/content/OpenTrace:/content/Trace-Bench:$PYTHONPATH python -m trace_bench ui --runs-dir \"$RUNS_DIR\" --share\n" - ], - "id": "wDCPwjtl2QMk", - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "* Running on local URL: http://127.0.0.1:7860\n", - "* Running on public URL: https://f9f1596fdf4957b63e.gradio.live\n", - "\n", - "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n", - "Warning: You are sending unauthenticated requests to the HF Hub. 
Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n", - "README.md: 100% 150/150 [00:00<00:00, 378kB/s]\n", - "py_src_index.json: 2.04MB [00:00, 136MB/s]\n", - "Generating train split: 140 examples [00:00, 2097.99 examples/s]\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Run complete: 20260305-071546-cc2fe733\n", + "Run dir: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733\n", + "Manifest: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/meta/manifest.json\n", + "Results: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/results.csv\n", + "Summary: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/summary.json\n", + "Leaderboard: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/leaderboard.csv\n", + "Files index: /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/meta/files_index.json\n", + "\n", + "--- Run artifacts ---\n", + "/content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-063935-71c660a2/:\n", + "total 13\n", + "drwx------ 2 root root 4096 Mar 5 06:39 jobs\n", + "-rw------- 1 root root 202 Mar 5 06:39 leaderboard.csv\n", + "drwx------ 2 root root 4096 Mar 5 06:39 meta\n", + "-rw------- 1 root root 3242 Mar 5 06:39 results.csv\n", + "-rw------- 1 root root 212 Mar 5 06:39 summary.json\n", + "\n", + "/content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-064200-ba7c04cb/:\n", + "total 12\n", + "drwx------ 2 root root 4096 Mar 5 06:42 jobs\n", + "-rw------- 1 root root 151 Mar 5 07:01 leaderboard.csv\n", + "drwx------ 2 root root 4096 Mar 5 06:42 meta\n", + "-rw------- 1 root root 2809 Mar 5 07:01 results.csv\n", + "-rw------- 1 root root 234 Mar 5 07:01 summary.json\n", + "\n", + "/content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071314-cc2fe733/:\n", + "total 13\n", + "drwx------ 2 root root 
4096 Mar 5 07:13 jobs\n", + "-rw------- 1 root root 202 Mar 5 07:13 leaderboard.csv\n", + "drwx------ 2 root root 4096 Mar 5 07:13 meta\n", + "-rw------- 1 root root 3242 Mar 5 07:13 results.csv\n", + "-rw------- 1 root root 212 Mar 5 07:13 summary.json\n", + "\n", + "/content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/:\n", + "total 13\n", + "drwx------ 6 root root 4096 Mar 5 07:15 jobs\n", + "-rw------- 1 root root 200 Mar 5 07:15 leaderboard.csv\n", + "drwx------ 2 root root 4096 Mar 5 07:15 meta\n", + "-rw------- 1 root root 3239 Mar 5 07:15 results.csv\n", + "-rw------- 1 root root 212 Mar 5 07:15 summary.json\n", + "\n", + "--- results.csv (head) ---\n", + "==> /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-063935-71c660a2/results.csv <==\n", + "run_id,job_id,task_id,suite,trainer_id,seed,status,score_initial,score_final,score_best,time_seconds,resolved_optimizer,resolved_guide,resolved_logger,resolved_trainer_kwargs,resolved_optimizer_kwargs,resolved_guide_kwargs,resolved_logger_kwargs,eval_kwargs,feedback,llm_provider,llm_model,llm_base_url,token_scope,prompt_tokens,completion_tokens,total_tokens,initial_state_path,best_state_path,final_state_path,state_history_path,tb_logdir\r\n", + "20260305-063935-71c660a2,a757c4090147,internal:numeric_param,internal,GEPA-Base,123,ok,-3.0,-3.0,-3.0,5.807537,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target 
value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/a757c4090147/artifacts/initial_state.yaml,jobs/a757c4090147/artifacts/best_state.yaml,jobs/a757c4090147/artifacts/final_state.yaml,jobs/a757c4090147/artifacts/state_history.jsonl,jobs/a757c4090147/tb\r\n", + "20260305-063935-71c660a2,806f391306ae,internal:numeric_param,internal,PrioritySearch,123,ok,-3.0,-3.0,-3.0,5.817129,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/806f391306ae/artifacts/initial_state.yaml,jobs/806f391306ae/artifacts/best_state.yaml,jobs/806f391306ae/artifacts/final_state.yaml,jobs/806f391306ae/artifacts/state_history.jsonl,jobs/806f391306ae/tb\r\n", + "20260305-063935-71c660a2,51046d033ec9,internal:code_param,internal,GEPA-Base,123,ok,1.0,1.0,1.0,0.218794,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/51046d033ec9/artifacts/initial_state.yaml,jobs/51046d033ec9/artifacts/best_state.yaml,jobs/51046d033ec9/artifacts/final_state.yaml,jobs/51046d033ec9/artifacts/state_history.jsonl,jobs/51046d033ec9/tb\r\n", + 
"20260305-063935-71c660a2,3dc5df989755,internal:code_param,internal,PrioritySearch,123,ok,1.0,1.0,1.0,0.305007,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/3dc5df989755/artifacts/initial_state.yaml,jobs/3dc5df989755/artifacts/best_state.yaml,jobs/3dc5df989755/artifacts/final_state.yaml,jobs/3dc5df989755/artifacts/state_history.jsonl,jobs/3dc5df989755/tb\r\n", + "\n", + "==> /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-064200-ba7c04cb/results.csv <==\n", + "run_id,job_id,task_id,suite,trainer_id,seed,status,score_initial,score_final,score_best,time_seconds,resolved_optimizer,resolved_guide,resolved_logger,resolved_trainer_kwargs,resolved_optimizer_kwargs,resolved_guide_kwargs,resolved_logger_kwargs,eval_kwargs,feedback,llm_provider,llm_model,llm_base_url,token_scope,prompt_tokens,completion_tokens,total_tokens,initial_state_path,best_state_path,final_state_path,state_history_path,tb_logdir\r\n", + "20260305-064200-ba7c04cb,ea4ad49014e2,llm4ad:optimization/bp_2d_construct,llm4ad,GEPA-Beam,123,ok,-29.0,-29.0,-29.0,498.058084,OPROv2,LLM4AD.llm4ad_loader.AutonomousEvaluatorGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_search_iterations\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2, \"\"verbose\"\": false}\",\"{\"\"memory_size\"\": 10, \"\"objective\"\": \"\"You are optimizing the implementation of `determine_next_assignment` for the LLM4AD task.\\\\n\\\\nTask description:\\\\n'\\\\n\\\\nYour goal is to return a correct and efficient function whose score (computed by the task evaluator) is as high as 
possible.\"\"}\",{},{},{},\"TRACE_FEEDBACK_JSON={\"\"status\"\": \"\"ok\"\", \"\"phase\"\": \"\"evaluate\"\", \"\"score\"\": -29.0}\n", + "Autonomous eval OK in 57.94s; score=-29.0\n", + "TRACE_FEEDBACK_JSON={\"\"status\"\": \"\"ok\"\", \"\"phase\"\": \"\"evaluate\"\", \"\"score\"\": -29.0}\n", + "Autonomous eval OK in 57.94s; score=-29.0\",openrouter,openrouter/x-ai/grok-4.1-fast,https://openrouter.ai/api/v1,trace_optimization_only,0,0,0,jobs/ea4ad49014e2/artifacts/initial_state.yaml,jobs/ea4ad49014e2/artifacts/best_state.yaml,jobs/ea4ad49014e2/artifacts/final_state.yaml,jobs/ea4ad49014e2/artifacts/state_history.jsonl,jobs/ea4ad49014e2/tb\r\n", + "\n", + "==> /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071314-cc2fe733/results.csv <==\n", + "run_id,job_id,task_id,suite,trainer_id,seed,status,score_initial,score_final,score_best,time_seconds,resolved_optimizer,resolved_guide,resolved_logger,resolved_trainer_kwargs,resolved_optimizer_kwargs,resolved_guide_kwargs,resolved_logger_kwargs,eval_kwargs,feedback,llm_provider,llm_model,llm_base_url,token_scope,prompt_tokens,completion_tokens,total_tokens,initial_state_path,best_state_path,final_state_path,state_history_path,tb_logdir\r\n", + "20260305-071314-cc2fe733,a757c4090147,internal:numeric_param,internal,GEPA-Base,123,ok,-3.0,-3.0,-3.0,4.712792,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/a757c4090147/artifacts/initial_state.yaml,jobs/a757c4090147/artifacts/best_state.yaml,jobs/a757c4090147/artifacts/final_state.yaml,jobs/a757c4090147/artifacts/state_history.jsonl,jobs/a757c4090147/tb\r\n", + 
"20260305-071314-cc2fe733,806f391306ae,internal:numeric_param,internal,PrioritySearch,123,ok,-3.0,-3.0,-3.0,4.713517,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/806f391306ae/artifacts/initial_state.yaml,jobs/806f391306ae/artifacts/best_state.yaml,jobs/806f391306ae/artifacts/final_state.yaml,jobs/806f391306ae/artifacts/state_history.jsonl,jobs/806f391306ae/tb\r\n", + "20260305-071314-cc2fe733,51046d033ec9,internal:code_param,internal,GEPA-Base,123,ok,1.0,1.0,1.0,0.090701,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/51046d033ec9/artifacts/initial_state.yaml,jobs/51046d033ec9/artifacts/best_state.yaml,jobs/51046d033ec9/artifacts/final_state.yaml,jobs/51046d033ec9/artifacts/state_history.jsonl,jobs/51046d033ec9/tb\r\n", + "20260305-071314-cc2fe733,3dc5df989755,internal:code_param,internal,PrioritySearch,123,ok,1.0,1.0,1.0,0.160663,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the 
target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/3dc5df989755/artifacts/initial_state.yaml,jobs/3dc5df989755/artifacts/best_state.yaml,jobs/3dc5df989755/artifacts/final_state.yaml,jobs/3dc5df989755/artifacts/state_history.jsonl,jobs/3dc5df989755/tb\r\n", + "\n", + "==> /content/drive/MyDrive/bench/2026-03-05/trace_bench_m3/20260305-071546-cc2fe733/results.csv <==\n", + "run_id,job_id,task_id,suite,trainer_id,seed,status,score_initial,score_final,score_best,time_seconds,resolved_optimizer,resolved_guide,resolved_logger,resolved_trainer_kwargs,resolved_optimizer_kwargs,resolved_guide_kwargs,resolved_logger_kwargs,eval_kwargs,feedback,llm_provider,llm_model,llm_base_url,token_scope,prompt_tokens,completion_tokens,total_tokens,initial_state_path,best_state_path,final_state_path,state_history_path,tb_logdir\r\n", + "20260305-071546-cc2fe733,a757c4090147,internal:numeric_param,internal,GEPA-Base,123,ok,-3.0,-3.0,-3.0,6.5457,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/a757c4090147/artifacts/initial_state.yaml,jobs/a757c4090147/artifacts/best_state.yaml,jobs/a757c4090147/artifacts/final_state.yaml,jobs/a757c4090147/artifacts/state_history.jsonl,jobs/a757c4090147/tb\r\n", + "20260305-071546-cc2fe733,806f391306ae,internal:numeric_param,internal,PrioritySearch,123,ok,-3.0,-3.0,-3.0,6.62411,OptoPrimeV2,trace_bench.examples.internal_numeric_param.NumericGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", 
\"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the numeric target value.\"\"}\",{},{},{},target=3.0,,,,trace_optimization_only,0,0,0,jobs/806f391306ae/artifacts/initial_state.yaml,jobs/806f391306ae/artifacts/best_state.yaml,jobs/806f391306ae/artifacts/final_state.yaml,jobs/806f391306ae/artifacts/state_history.jsonl,jobs/806f391306ae/tb\r\n", + "20260305-071546-cc2fe733,51046d033ec9,internal:code_param,internal,GEPA-Base,123,ok,1.0,1.0,1.0,0.156546,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"merge_every\"\": 2, \"\"num_iters\"\": 1, \"\"pareto_subset_size\"\": 2, \"\"train_batch_size\"\": 2}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/51046d033ec9/artifacts/initial_state.yaml,jobs/51046d033ec9/artifacts/best_state.yaml,jobs/51046d033ec9/artifacts/final_state.yaml,jobs/51046d033ec9/artifacts/state_history.jsonl,jobs/51046d033ec9/tb\r\n", + "20260305-071546-cc2fe733,3dc5df989755,internal:code_param,internal,PrioritySearch,123,ok,1.0,1.0,1.0,0.273471,OptoPrimeV2,trace_bench.examples.internal_code_param.CodeExactGuide,ConsoleLogger,\"{\"\"num_batches\"\": 1, \"\"num_candidates\"\": 2, \"\"num_epochs\"\": 1, \"\"num_proposals\"\": 2, \"\"num_steps\"\": 1}\",\"{\"\"llm\"\": {\"\"__class__\"\": \"\"DummyLLM\"\", \"\"__module__\"\": \"\"opto.utils.llm\"\"}, \"\"memory_size\"\": 5, \"\"objective\"\": \"\"Match the target code exactly.\"\"}\",{},{},{},Correct,,,,trace_optimization_only,0,0,0,jobs/3dc5df989755/artifacts/initial_state.yaml,jobs/3dc5df989755/artifacts/best_state.yaml,jobs/3dc5df989755/artifacts/final_state.yaml,jobs/3dc5df989755/artifacts/state_history.jsonl,jobs/3dc5df989755/tb\r\n" + ] + } + ], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd 
/content/Trace-Bench\n", + "\n", + "# Use the bundled M3 demo config\n", + "PYTHONPATH=/content/OpenTrace:/content/Trace-Bench:$PYTHONPATH python -m trace_bench run --config configs/m3_ui_demo.yaml --runs-dir \"$RUNS_DIR\"\n", + "\n", + "echo \"\"\n", + "echo \"--- Run artifacts ---\"\n", + "ls -la \"$RUNS_DIR\"/*/\n", + "echo \"\"\n", + "echo \"--- results.csv (head) ---\"\n", + "head -5 \"$RUNS_DIR\"/*/results.csv 2>/dev/null || echo \"(no results.csv found)\"\n" + ], + "id": "Y02hciN42QMi" + }, + { + "cell_type": "code", + "id": "6pqajjt3r8g", + "source": [ + "# Show run summary\n", + "import json, glob\n", + "\n", + "for summary_path in sorted(glob.glob(f\"{RUNS_DIR}/*/summary.json\")):\n", + " with open(summary_path) as f:\n", + " summary = json.load(f)\n", + " run_id = summary_path.split(\"/\")[-2]\n", + " print(f\"Run: {run_id}\")\n", + " print(f\" Total jobs: {summary.get('total_jobs', '?')}\")\n", + " print(f\" Counts: {summary.get('counts', {})}\")\n", + " print()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "6pqajjt3r8g", + "outputId": "a0b40aef-122f-401c-cc9c-51a65c195869" + }, + "execution_count": 4, + "outputs": [ { - "cell_type": "markdown", - "id": "ggsltbvz9x", - "source": [ - "## TensorBoard\n", - "\n", - "Each job stores TensorBoard event files under `jobs//tb/`. When the trainer uses `TensorboardLogger`, the runner auto-injects the per-job logdir.\n", - "\n", - "To view:\n", - "```bash\n", - "tensorboard --logdir \"$RUNS_DIR//jobs\"\n", - "```\n", - "\n", - "The Gradio UI also has a \"Launch TensorBoard\" button in the Browse tab.\n", - "\n", - "## MLflow (optional)\n", - "\n", - "MLflow mirroring is **opt-in**. Set `MLFLOW_TRACKING_URI` to enable:\n", - "```bash\n", - "export MLFLOW_TRACKING_URI=mlruns\n", - "python -m trace_bench run --config configs/m3_ui_demo.yaml --runs-dir runs\n", - "```\n", - "\n", - "When enabled, the runner mirrors run params, job metrics, and artifact links to MLflow. 
The filesystem remains the canonical source of truth.\n", - "\n", - "**Note:** Deep MLflow/OTel logger integration depends on the Trace team's upcoming PR. Current behavior is minimal mirroring of `score_initial`, `score_final`, `score_best`, and `time_seconds`." - ], - "metadata": { - "id": "ggsltbvz9x" - } + "output_type": "stream", + "name": "stdout", + "text": [ + "Run: 20260305-063935-71c660a2\n", + " Total jobs: 4\n", + " Counts: {'ok': 4, 'failed': 0, 'skipped': 0}\n", + "\n", + "Run: 20260305-064200-ba7c04cb\n", + " Total jobs: 2\n", + " Counts: {'ok': 2, 'failed': 0, 'skipped': 0}\n", + "\n", + "Run: 20260305-071314-cc2fe733\n", + " Total jobs: 4\n", + " Counts: {'ok': 4, 'failed': 0, 'skipped': 0}\n", + "\n", + "Run: 20260305-071546-cc2fe733\n", + " Total jobs: 4\n", + " Counts: {'ok': 4, 'failed': 0, 'skipped': 0}\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "TqnwOwlN2QMj", + "outputId": "0baf2710-fade-4175-8cf7-678f446f9f6a" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "kmybt9p8yi", - "source": [ - "## Summary\n", - "\n", - "**M3 deliverables demonstrated:**\n", - "- Gradio UI with 3 tabs: Launch, Browse, Job Inspector\n", - "- Dynamic task/trainer discovery (non-hardcoded)\n", - "- Provider clarity: `custom/openai/openrouter` selector + base URL/key/model fields\n", - "- Config editor: load, edit, save, run YAML configs from the UI\n", - "- Logger override from UI (`default/none/`)\n", - "- Resume a previous run preserving the original run_id\n", - "- TensorBoard action returns URL/command guidance\n", - "- MLflow opt-in mirroring (filesystem canonical)\n", - "- CLI: `trace-bench run --logger ...` and `trace-bench ui --runs-dir ... --share --tasks-root ... 
--port ...`\n" - ], - "metadata": { - "id": "kmybt9p8yi" - } + "output_type": "stream", + "name": "stdout", + "text": [ + "[info] Secret not found or inaccessible: OPENAI_API_KEY\n", + "OPENROUTER_API_KEY loaded: True\n", + "OPENAI_API_KEY loaded: False\n" + ] } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.x" - }, + ], + "source": [ + "# Load API keys from Colab secrets into env (safe: no hard failure if missing)\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import userdata\n", + "except Exception:\n", + " userdata = None\n", + "\n", + "\n", + "def _safe_secret(name: str) -> str:\n", + " if userdata is None:\n", + " return \"\"\n", + " try:\n", + " return userdata.get(name) or \"\"\n", + " except Exception:\n", + " print(f\"[info] Secret not found or inaccessible: {name}\")\n", + " return \"\"\n", + "\n", + "# Keep existing env value first; fallback to secret if present\n", + "os.environ[\"OPENROUTER_API_KEY\"] = os.environ.get(\"OPENROUTER_API_KEY\") or _safe_secret(\"OPENROUTER_API_KEY\")\n", + "os.environ[\"OPENAI_API_KEY\"] = os.environ.get(\"OPENAI_API_KEY\") or _safe_secret(\"OPENAI_API_KEY\")\n", + "\n", + "print(\"OPENROUTER_API_KEY loaded:\", bool(os.environ.get(\"OPENROUTER_API_KEY\")))\n", + "print(\"OPENAI_API_KEY loaded:\", bool(os.environ.get(\"OPENAI_API_KEY\")))\n", + "\n", + "if not os.environ.get(\"OPENROUTER_API_KEY\") and not os.environ.get(\"OPENAI_API_KEY\"):\n", + " print(\"[warn] No API key found. Real-mode runs may fail; stub mode still works.\")\n" + ], + "id": "TqnwOwlN2QMj" + }, + { + "cell_type": "markdown", + "id": "ze8ljpzqqzb", + "source": [ + "## Launch Gradio UI\n", + "\n", + "The UI has 3 tabs:\n", + "1. **Launch Run** ? 
discover tasks/trainers dynamically, edit configs in YAML editor, choose provider (`custom/openai/openrouter`), and run with overrides including logger override (`default/none/`).\n", + "2. **Browse Runs** — select a run, view results/config/summary, filter by suite/status/trainer, resume a run.\n", + "3. **Job Inspector** — drill into individual jobs, view meta/events/state artifacts/TensorBoard dir.\n", + "\n", + "The `--share` flag generates a public URL (auto-detected on Colab).\n", + "\n", + "**Note:** This cell blocks while the UI is running. Open the printed URL to interact.\n" + ], + "metadata": { + "id": "ze8ljpzqqzb" + } + }, + { + "cell_type": "code", + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" + }, + "id": "wDCPwjtl2QMk", + "outputId": "daca6327-1745-46cf-ba7d-3972d114d0af" + }, + "source": [ + "!PYTHONPATH=/content/OpenTrace:/content/Trace-Bench:$PYTHONPATH python -m trace_bench ui --runs-dir \"$RUNS_DIR\" --share\n" + ], + "id": "wDCPwjtl2QMk", + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "* Running on local URL: http://127.0.0.1:7860\n", + "* Running on public URL: https://f9f1596fdf4957b63e.gradio.live\n", + "\n", + "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n", + "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n", + "README.md: 100% 150/150 [00:00<00:00, 378kB/s]\n", + "py_src_index.json: 2.04MB [00:00, 136MB/s]\n", + "Generating train split: 140 examples [00:00, 2097.99 examples/s]\n" + ] } + ] + }, + { + "cell_type": "markdown", + "id": "ggsltbvz9x", + "source": [ + "## TensorBoard\n", + "\n", + "Each job stores TensorBoard event files under `jobs/<job_id>/tb/`. 
When the trainer uses `TensorboardLogger`, the runner auto-injects the per-job logdir.\n", + "\n", + "To view:\n", + "```bash\n", + "tensorboard --logdir \"$RUNS_DIR/<run_id>/jobs\"\n", + "```\n", + "\n", + "The Gradio UI also has a \"Launch TensorBoard\" button in the Browse tab.\n", + "\n", + "## MLflow (optional)\n", + "\n", + "MLflow mirroring is **opt-in**. Set `MLFLOW_TRACKING_URI` to enable:\n", + "```bash\n", + "export MLFLOW_TRACKING_URI=mlruns\n", + "python -m trace_bench run --config configs/m3_ui_demo.yaml --runs-dir runs\n", + "```\n", + "\n", + "When enabled, the runner mirrors run params, job metrics, and artifact links to MLflow. The filesystem remains the canonical source of truth.\n", + "\n", + "**Note:** Deep MLflow/OTel integration is available via OpenTrace unified telemetry. Current Trace-Bench MLflow logging still mirrors `score_initial`, `score_final`, `score_best`, and `time_seconds`, but OpenTrace can additionally emit **MLflow trace spans** for traced operations.\n\nTo capture OpenTrace traces inside Trace-Bench runs:\n1) enable Trace-Bench MLflow logging,\n2) enable OpenTrace MLflow tracing via `import opto.trace as trace; trace.mlflow.autolog(...)`, and\n3) ensure the active MLflow run context is rebound inside worker threads (Trace-Bench does this with `bind_active_run(...)` when using `max_workers > 1`).\n" + ], + "metadata": { + "id": "ggsltbvz9x" + } + }, + { + "cell_type": "markdown", + "id": "kmybt9p8yi", + "source": [ + "## Summary\n", + "\n", + "**M3 deliverables demonstrated:**\n", + "- Gradio UI with 3 tabs: Launch, Browse, Job Inspector\n", + "- Dynamic task/trainer discovery (non-hardcoded)\n", + "- Provider clarity: `custom/openai/openrouter` selector + base URL/key/model fields\n", + "- Config editor: load, edit, save, run YAML configs from the UI\n", + "- Logger override from UI (`default/none/`)\n", + "- Resume a previous run preserving the original run_id\n", + "- TensorBoard action returns URL/command guidance\n", + "- MLflow 
@contextmanager
def bind_active_run(ctx: Optional[MLflowRunContext]):
    """Bind the parent MLflow run to the current thread for the ``with`` body.

    Trace-Bench executes jobs in worker threads when ``max_workers > 1``.
    MLflow active-run state is thread-local, so any MLflow trace spans emitted
    inside those workers (for example by OpenTrace unified telemetry) must
    explicitly re-attach to the parent run in that thread.

    Args:
        ctx: Run context returned by ``log_run_start``, or ``None`` when
            MLflow mirroring is not in use (the manager is then a no-op).

    Yields:
        ``None``. The body always runs exactly once, whether or not the run
        could be bound.

    Raises:
        Whatever the ``with`` body raises, unchanged. Failures of the MLflow
        calls themselves are swallowed: telemetry must never break a
        benchmark run.
    """
    if ctx is None or not _enabled():
        yield
        return

    mlflow = _safe_import_mlflow()
    if mlflow is None:
        yield
        return

    # Local import keeps this block self-contained with respect to the
    # module's import list.
    import logging

    log = logging.getLogger(__name__)

    # Phase 1 (best effort): attach to the parent run.  Only MLflow calls are
    # inside this ``try`` so that a telemetry failure is never confused with a
    # failure of the job executed in the ``with`` body.
    run_cm = None
    try:
        tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
        if tracking_uri:
            mlflow.set_tracking_uri(tracking_uri)

        active = mlflow.active_run()
        # Re-attach only when this thread is not already bound to the run.
        if active is None or active.info.run_id != ctx.active_run_id:
            run_cm = mlflow.start_run(run_id=ctx.active_run_id)
            run_cm.__enter__()
    except Exception:
        log.debug("MLflow run binding failed; continuing unbound", exc_info=True)
        run_cm = None

    def _detach(exc_type, exc, tb):
        """Leave the run context entered above, ignoring telemetry errors."""
        if run_cm is None:
            return
        try:
            run_cm.__exit__(exc_type, exc, tb)
        except Exception:
            log.debug("MLflow run unbinding failed", exc_info=True)

    # Phase 2: yield exactly once.  A generator-based context manager must not
    # yield again after an exception is thrown into it, otherwise contextlib
    # raises ``RuntimeError("generator didn't stop after throw()")`` and the
    # caller loses the original error.
    try:
        yield
    except BaseException as err:
        # Forward the failure to the run context (as a ``with`` statement
        # would) and let the original exception propagate to the caller.
        _detach(type(err), err, err.__traceback__)
        raise
    else:
        _detach(None, None, None)