Use src.optimizer.workflow for headless or scripted runs.
python -m src.optimizer.workflow profile --project <project_name>

Captures operator I/O tensors and baseline benchmark data.
# All operators
python -m src.optimizer.workflow generate \
--project <project_name> \
--target-device cuda
# Selected operators only
python -m src.optimizer.workflow generate \
--project <project_name> \
--ops torch_nn_functional_conv2d,torch_nn_functional_linear \
--target-device cuda

# Generate with optimization and benchmarking in one pass
python -m src.optimizer.workflow generate \
--project <project_name> \
--target-device cuda \
--optimize \
--benchmark \
--iterations 5

# Optimize selected operators
python -m src.optimizer.workflow optimize \
--project <project_name> \
--ops torch_nn_functional_conv2d \
--target-device cuda \
--iterations 5 \
--benchmark

# Re-run benchmarks only
python -m src.optimizer.workflow benchmark --project <project_name>

Project directory layout:

kernels/projects/<project_name>/
├── state.json # job state (progress, pause/cancel)
├── io/
│ ├── summary.json
│ └── individual_ops/ # captured tensor I/O per operator
├── kernels/
│ └── generated/individual_op_kernels/<op>/
├── trees/<op>/ # MCTS nodes and kernel source per attempt
├── benchmarks/op_benchmarks.json
└── logs/