From 40a8c7d8999ae779450b680f2784e4594a78c247 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 11:26:57 -0500 Subject: [PATCH 01/11] Prefer local CoMLRL path for torchrun runs --- train/train_iac.py | 3 +++ train/train_maac.py | 3 +++ train/train_magrpo.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/train/train_iac.py b/train/train_iac.py index eade3a1..782e425 100644 --- a/train/train_iac.py +++ b/train/train_iac.py @@ -21,6 +21,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) sys.path.insert(0, REPO_ROOT) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import load_dataset # type: ignore from transformers import AutoTokenizer # type: ignore diff --git a/train/train_maac.py b/train/train_maac.py index 4ca6fa9..680ad70 100644 --- a/train/train_maac.py +++ b/train/train_maac.py @@ -21,6 +21,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) sys.path.insert(0, REPO_ROOT) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import load_dataset # type: ignore from transformers import AutoTokenizer # type: ignore diff --git a/train/train_magrpo.py b/train/train_magrpo.py index ec497b3..85884fb 100644 --- a/train/train_magrpo.py +++ b/train/train_magrpo.py @@ -18,6 +18,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) sys.path.insert(0, REPO_ROOT) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import load_dataset # type: ignore from transformers import AutoModelForCausalLM, AutoTokenizer # type: ignore From f7dde88e952adfb3f11c4ad00dfaafbc8b847710 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 13:39:17 -0500 Subject: [PATCH 02/11] pass parallel mode and device scheduling config to trainers --- train/train_iac.py | 16 ++++++++++++++++ train/train_maac.py | 16 ++++++++++++++++ utils/trainer_args.py | 15 +++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/train/train_iac.py b/train/train_iac.py index 782e425..97dd3a8 100644 --- a/train/train_iac.py +++ b/train/train_iac.py @@ -180,6 +180,19 @@ def _as_bool(x: Any, default: bool) -> bool: return bool(default) +def _as_device_spec(x: Any) -> Any: + if x is None: + return None + if isinstance(x, str): + s = x.strip() + if s.lower() in ("none", "null", ""): + return None + return s + if isinstance(x, (list, tuple)): + return [str(v) for v in x] + return str(x) + + def _map_dtype(x: Any) -> Any: if isinstance(x, torch.dtype): return x @@ -234,6 +247,9 @@ def _build_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str]) -> IACCon "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( tr.get("critic_value_head_hidden_dim", None), None ), diff --git a/train/train_maac.py b/train/train_maac.py index 680ad70..30da008 100644 --- a/train/train_maac.py +++ b/train/train_maac.py @@ -180,6 +180,19 @@ def _as_bool(x: Any, default: bool) -> bool: return bool(default) +def _as_device_spec(x: Any) -> Any: + if x is None: + return None + if isinstance(x, str): + s = x.strip() + if s.lower() in ("none", "null", ""): + return None + return s + if isinstance(x, (list, tuple)): + return [str(v) for v in x] + return str(x) + + def _map_dtype(x: Any) -> Any: if isinstance(x, torch.dtype): return x @@ -236,6 +249,9 @@ def _build_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str]) -> MAACC "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "discount": _as_float(tr.get("discount", 0.9), 0.9), + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_type": critic_type, "early_termination_threshold": _as_opt_float( tr.get("early_termination_threshold", None), None diff --git a/utils/trainer_args.py b/utils/trainer_args.py index 51294ce..121fcd8 100644 --- a/utils/trainer_args.py +++ b/utils/trainer_args.py @@ -94,6 +94,19 @@ def _as_bool(x: Any, default: bool) -> bool: return bool(x) +def _as_device_spec(x: Any) -> Any: + if x is None: + return None + if isinstance(x, str): + s = x.strip() + if s.lower() in ("none", "null", ""): + return None + return s + if isinstance(x, (list, tuple)): + return [str(v) for v in x] + return str(x) + + def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: tr = cfg.get("magrpo", {}) lr_val = tr.get("agent_learning_rate", 3e-5) @@ -113,6 +126,8 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: candidate.update( { "num_agents": _as_int(tr.get("num_agents", 1), 1), + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": str(tr.get("joint_mode", "aligned")), } From 90a60455417598dd09cd239223579847d0671098 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 14:02:25 -0500 Subject: [PATCH 03/11] use parallel_training key and set default auto in configs --- configs/iac_classeval_config.yaml | 1 + configs/maac_classeval_config.yaml | 1 + configs/magrpo_classeval_config.yaml | 1 + train/train_iac.py | 2 +- train/train_maac.py | 2 +- utils/trainer_args.py | 2 +- 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/configs/iac_classeval_config.yaml b/configs/iac_classeval_config.yaml index 9ed0e7a..df08a78 100644 --- a/configs/iac_classeval_config.yaml +++ b/configs/iac_classeval_config.yaml @@ -36,6 +36,7 @@ external: previous_response: true iac: + parallel_training: auto num_agents: 2 num_turns: 2 num_train_epochs: 40 diff --git a/configs/maac_classeval_config.yaml b/configs/maac_classeval_config.yaml index a57fa4a..6974927 100644 --- a/configs/maac_classeval_config.yaml +++ b/configs/maac_classeval_config.yaml @@ -36,6 +36,7 @@ external: previous_response: true maac: + parallel_training: auto num_agents: 2 num_turns: 2 critic_type: v diff --git a/configs/magrpo_classeval_config.yaml b/configs/magrpo_classeval_config.yaml index 813d2b8..02f35e2 100644 --- a/configs/magrpo_classeval_config.yaml +++ b/configs/magrpo_classeval_config.yaml @@ -30,6 +30,7 @@ external: previous_response: true magrpo: + parallel_training: auto num_agents: 2 num_turns: 2 num_train_epochs: 13 diff --git a/train/train_iac.py b/train/train_iac.py index 97dd3a8..be3b648 100644 --- a/train/train_iac.py +++ b/train/train_iac.py @@ -247,7 +247,7 @@ def _build_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str]) -> IACCon "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( diff --git a/train/train_maac.py b/train/train_maac.py index 30da008..d4a7f67 100644 --- a/train/train_maac.py +++ b/train/train_maac.py @@ -249,7 +249,7 @@ def _build_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str]) -> MAACC "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "discount": _as_float(tr.get("discount", 0.9), 0.9), - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_type": critic_type, diff --git a/utils/trainer_args.py b/utils/trainer_args.py index 121fcd8..7b85c50 100644 --- a/utils/trainer_args.py +++ b/utils/trainer_args.py @@ -126,7 +126,7 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: candidate.update( { "num_agents": _as_int(tr.get("num_agents", 1), 1), - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": str(tr.get("joint_mode", "aligned")), From 815895c0a94a24ae1f65bc9077513be1abcb70ab Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 15:57:38 -0500 Subject: [PATCH 04/11] ud --- configs/iac_classeval_config.yaml | 6 +--- configs/maac_classeval_config.yaml | 6 +--- configs/magrpo_classeval_config.yaml | 4 +-- train/train_iac.py | 47 ++++++++++++++++++++++++--- train/train_maac.py | 47 ++++++++++++++++++++++++--- train/train_magrpo.py | 8 +++-- utils/trainer_args.py | 48 +++++++++++++++++++++++++--- 7 files changed, 136 insertions(+), 30 deletions(-) diff --git a/configs/iac_classeval_config.yaml b/configs/iac_classeval_config.yaml index df08a78..bc62996 100644 --- a/configs/iac_classeval_config.yaml +++ b/configs/iac_classeval_config.yaml @@ -3,6 +3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: null max_length: 2048 torch_dtype: bfloat16 @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-4B-Instruct-2507" type: qwen - temperature: 0.6 - top_p: 0.6 max_length: 2048 torch_dtype: bfloat16 @@ -47,9 +46,6 @@ iac: rollout_buffer_size: 2 train_batch_size: 2 max_new_tokens: 600 - temperature: 0.6 - top_p: 0.6 - top_k: null num_generations: 1 use_separate_critic: true discount: 0.9 diff --git a/configs/maac_classeval_config.yaml b/configs/maac_classeval_config.yaml index 6974927..49f6194 100644 --- a/configs/maac_classeval_config.yaml +++ b/configs/maac_classeval_config.yaml @@ -3,6 +3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: null max_length: 2048 torch_dtype: bfloat16 @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-4B-Instruct-2507" type: qwen - temperature: 0.6 - top_p: 0.6 max_length: 2048 torch_dtype: bfloat16 @@ -47,9 +46,6 @@ maac: rollout_buffer_size: 2 train_batch_size: 2 max_new_tokens: 600 - temperature: 0.6 - top_p: 0.6 - top_k: null num_generations: 1 discount: 0.9 early_termination_threshold: -0.2 diff --git a/configs/magrpo_classeval_config.yaml b/configs/magrpo_classeval_config.yaml index 02f35e2..1db787f 100644 --- a/configs/magrpo_classeval_config.yaml +++ b/configs/magrpo_classeval_config.yaml @@ -3,6 +3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: null max_length: 2048 torch_dtype: bfloat16 @@ -38,9 +39,6 @@ magrpo: logging_steps: 1 num_generations: 2 max_new_tokens: 600 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 joint_mode: aligned early_termination_threshold: -0.2 diff --git a/train/train_iac.py b/train/train_iac.py index be3b648..65f3371 100644 --- a/train/train_iac.py +++ b/train/train_iac.py @@ -193,6 +193,42 @@ def _as_device_spec(x: Any) -> Any: return str(x) +def _read_sampling_config(model_cfg: Dict[str, Any], *, section: str = "agent_model") -> Dict[str, Any]: + if not isinstance(model_cfg, dict): + raise ValueError(f"{section} must be a mapping.") + missing = [key for key in ("temperature", "top_p", "top_k") if key not in model_cfg] + if missing: + raise ValueError( + f"{section} is missing required sampling fields: {', '.join(missing)}." + ) + + def _require_float(key: str) -> float: + value = model_cfg.get(key) + if value is None or isinstance(value, bool): + raise ValueError(f"{section}.{key} must be provided as a float.") + try: + return float(value) + except Exception as exc: + raise ValueError(f"{section}.{key} must be a float, got {value!r}.") from exc + + def _parse_top_k() -> Optional[int]: + value = model_cfg.get("top_k") + if value is None: + return None + if isinstance(value, str) and value.strip().lower() in ("none", "null", ""): + return None + try: + return int(float(value)) + except Exception as exc: + raise ValueError(f"{section}.top_k must be an integer or null, got {value!r}.") from exc + + return { + "temperature": _require_float("temperature"), + "top_p": _require_float("top_p"), + "top_k": _parse_top_k(), + } + + def _map_dtype(x: Any) -> Any: if isinstance(x, torch.dtype): return x @@ -221,7 +257,7 @@ def _filter_config(candidate: Dict[str, Any], cfg_cls: Any) -> Dict[str, Any]: return {k: v for k, v in candidate.items() if k in params} -def _build_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str]) -> IACConfig: +def _build_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACConfig: tr = cfg.get("iac") or {} if not isinstance(tr, dict): tr = {} @@ -241,9 +277,9 @@ def _build_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str]) -> IACCon "value_clip_range": _as_opt_float(tr.get("value_clip_range", 0.2), 0.2), "advantage_normalization": _as_bool(adv_norm, True), "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), - "temperature": _as_float(tr.get("temperature", 0.6), 0.6), - "top_p": _as_float(tr.get("top_p", 0.6), 0.6), - "top_k": _as_opt_int(tr.get("top_k", None), None), + "temperature": sampling_cfg["temperature"], + "top_p": sampling_cfg["top_p"], + "top_k": sampling_cfg["top_k"], "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, @@ -368,7 +404,8 @@ def main() -> int: if torch_dtype is not None: model_kwargs["torch_dtype"] = torch_dtype - iac_args = _build_iac_args(cfg, model_name=model_name) + sampling_cfg = _read_sampling_config(model_cfg, section="agent_model") + iac_args = _build_iac_args(cfg, sampling_cfg=sampling_cfg) num_agents = int(getattr(iac_args, "num_agents", 1)) if agent_names is not None: if not isinstance(agent_names, (list, tuple)) or not all( diff --git a/train/train_maac.py b/train/train_maac.py index d4a7f67..7f85529 100644 --- a/train/train_maac.py +++ b/train/train_maac.py @@ -193,6 +193,42 @@ def _as_device_spec(x: Any) -> Any: return str(x) +def _read_sampling_config(model_cfg: Dict[str, Any], *, section: str = "agent_model") -> Dict[str, Any]: + if not isinstance(model_cfg, dict): + raise ValueError(f"{section} must be a mapping.") + missing = [key for key in ("temperature", "top_p", "top_k") if key not in model_cfg] + if missing: + raise ValueError( + f"{section} is missing required sampling fields: {', '.join(missing)}." + ) + + def _require_float(key: str) -> float: + value = model_cfg.get(key) + if value is None or isinstance(value, bool): + raise ValueError(f"{section}.{key} must be provided as a float.") + try: + return float(value) + except Exception as exc: + raise ValueError(f"{section}.{key} must be a float, got {value!r}.") from exc + + def _parse_top_k() -> Optional[int]: + value = model_cfg.get("top_k") + if value is None: + return None + if isinstance(value, str) and value.strip().lower() in ("none", "null", ""): + return None + try: + return int(float(value)) + except Exception as exc: + raise ValueError(f"{section}.top_k must be an integer or null, got {value!r}.") from exc + + return { + "temperature": _require_float("temperature"), + "top_p": _require_float("top_p"), + "top_k": _parse_top_k(), + } + + def _map_dtype(x: Any) -> Any: if isinstance(x, torch.dtype): return x @@ -221,7 +257,7 @@ def _filter_config(candidate: Dict[str, Any], cfg_cls: Any) -> Dict[str, Any]: return {k: v for k, v in candidate.items() if k in params} -def _build_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str]) -> MAACConfig: +def _build_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACConfig: tr = cfg.get("maac") or {} if not isinstance(tr, dict): tr = {} @@ -243,9 +279,9 @@ def _build_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str]) -> MAACC "value_loss_coef": _as_float(tr.get("value_loss_coef", 0.6), 0.6), "advantage_normalization": _as_bool(adv_norm, True), "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), - "temperature": _as_float(tr.get("temperature", 0.6), 0.6), - "top_p": _as_float(tr.get("top_p", 0.6), 0.6), - "top_k": _as_opt_int(tr.get("top_k", None), None), + "temperature": sampling_cfg["temperature"], + "top_p": sampling_cfg["top_p"], + "top_k": sampling_cfg["top_k"], "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "discount": _as_float(tr.get("discount", 0.9), 0.9), @@ -368,7 +404,8 @@ def main() -> int: if torch_dtype is not None: model_kwargs["torch_dtype"] = torch_dtype - maac_args = _build_maac_args(cfg, model_name=model_name) + sampling_cfg = _read_sampling_config(model_cfg, section="agent_model") + maac_args = _build_maac_args(cfg, sampling_cfg=sampling_cfg) num_agents = int(getattr(maac_args, "num_agents", 1)) if agent_names is not None: if not isinstance(agent_names, (list, tuple)) or not all( diff --git a/train/train_magrpo.py b/train/train_magrpo.py index 85884fb..707f328 100644 --- a/train/train_magrpo.py +++ b/train/train_magrpo.py @@ -27,7 +27,10 @@ import torch # type: ignore from comlrl.trainers.reinforce import MAGRPOTrainer # type: ignore -from LLM_Collab_Code_Completion.utils.trainer_args import get_trainer_args +from LLM_Collab_Code_Completion.utils.trainer_args import ( + get_trainer_args, + get_agent_sampling_config, +) from LLM_Collab_Code_Completion.utils.data import ( extract_class_name, @@ -257,7 +260,8 @@ def _map_dtype(x): strategy = get_strategy(num_agents=num_agents, seed=seed) - magrpo_args = get_trainer_args(cfg) + sampling_cfg = get_agent_sampling_config(cfg) + magrpo_args = get_trainer_args(cfg, sampling_cfg=sampling_cfg) formatters = build_agent_formatters(strategy) reward_func = get_reward_function(strategy=strategy, num_agents=num_agents) diff --git a/utils/trainer_args.py b/utils/trainer_args.py index 7b85c50..b2dc0df 100644 --- a/utils/trainer_args.py +++ b/utils/trainer_args.py @@ -107,7 +107,46 @@ def _as_device_spec(x: Any) -> Any: return str(x) -def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: +def get_agent_sampling_config(cfg: Dict[str, Any]) -> Dict[str, Any]: + model_cfg = cfg.get("agent_model") + if not isinstance(model_cfg, dict): + raise ValueError("agent_model must be a mapping.") + missing = [key for key in ("temperature", "top_p", "top_k") if key not in model_cfg] + if missing: + raise ValueError( + f"agent_model is missing required sampling fields: {', '.join(missing)}" + ) + + def _require_float(key: str) -> float: + value = model_cfg.get(key) + if value is None or isinstance(value, bool): + raise ValueError(f"agent_model.{key} must be provided as a float.") + try: + return float(value) + except Exception as exc: + raise ValueError(f"agent_model.{key} must be a float, got {value!r}.") from exc + + top_k_raw = model_cfg.get("top_k") + if isinstance(top_k_raw, str) and top_k_raw.strip().lower() in ("none", "null", ""): + top_k_val: Optional[int] = None + elif top_k_raw is None: + top_k_val = None + else: + try: + top_k_val = int(float(top_k_raw)) + except Exception as exc: + raise ValueError( + f"agent_model.top_k must be an integer or null, got {top_k_raw!r}." + ) from exc + + return { + "temperature": _require_float("temperature"), + "top_p": _require_float("top_p"), + "top_k": top_k_val, + } + + +def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAGRPOConfig: tr = cfg.get("magrpo", {}) lr_val = tr.get("agent_learning_rate", 3e-5) @@ -118,11 +157,10 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: "logging_steps": _as_int(tr.get("logging_steps", 50), 50), "num_generations": _as_int(tr.get("num_generations", 4), 4), "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), - "temperature": _as_float(tr.get("temperature", 0.2), 0.2), - "top_p": _as_float(tr.get("top_p", 0.95), 0.95), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.2), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.95), + "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), } - if "top_k" in tr: - candidate["top_k"] = _as_opt_int(tr.get("top_k", None), None) candidate.update( { "num_agents": _as_int(tr.get("num_agents", 1), 1), From 238fc6d0609b93413f7b41010be94e7bc6910363 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 16:33:41 -0500 Subject: [PATCH 05/11] ud --- README.md | 6 ++++-- configs/iac_classeval_config.yaml | 6 +++--- configs/maac_classeval_config.yaml | 6 +++--- configs/magrpo_classeval_config.yaml | 6 +++--- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a3ff149..eb859ec 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,11 @@ Key sections in `configs/magrpo_classeval_config.yaml`: ClassEval sub-slices or local mirrors. - `external`: feedback configuration (use `code_feedback` for syntax/test diagnostics). - `magrpo`: forwarded to `comlrl.trainers.reinforce.MAGRPOTrainer`. Includes collaboration - (`num_agents`, param-count assignment), sampling settings (`num_generations`, `num_turns`, - temperature/top_p), rollout buffering (`rollout_buffer_size`), optimization + (`num_agents`, param-count assignment), rollout settings (`num_generations`, `num_turns`), + rollout buffering (`rollout_buffer_size`), optimization hyperparameters, and IO controls. +- Sampling knobs (`temperature`, `top_p`, `top_k`) are configured in `agent_model` and passed + to trainer args at runtime. - `reward_processor`: optional post-processing for rewards (scale, shift). - `output`: persistence knobs (save final model, output paths, verbose debug prints). diff --git a/configs/iac_classeval_config.yaml b/configs/iac_classeval_config.yaml index bc62996..44c5a86 100644 --- a/configs/iac_classeval_config.yaml +++ b/configs/iac_classeval_config.yaml @@ -24,9 +24,9 @@ dataset: eval_split: test[66:82] output: - base_dir: output + base_dir: output_iac_classeval save_final_model: false - save_path: output/final_model + save_path: output_iac_classeval verbose: false external: @@ -64,5 +64,5 @@ wandb: project: classeval_dev entity: null name: codecompletion_classeval_iac - dir: output + dir: output_iac_classeval tags: ["iac", "classeval", "code-completion", "turns_2"] diff --git a/configs/maac_classeval_config.yaml b/configs/maac_classeval_config.yaml index 49f6194..b53b78e 100644 --- a/configs/maac_classeval_config.yaml +++ b/configs/maac_classeval_config.yaml @@ -24,9 +24,9 @@ dataset: eval_split: test[66:82] output: - base_dir: output + base_dir: output_maac_classeval save_final_model: false - save_path: output/final_model + save_path: output_maac_classeval verbose: false external: @@ -63,5 +63,5 @@ wandb: project: classeval_dev entity: null name: codecompletion_classeval_maac - dir: output + dir: output_maac_classeval tags: ["maac", "classeval", "code-completion", "turns_2"] diff --git a/configs/magrpo_classeval_config.yaml b/configs/magrpo_classeval_config.yaml index 1db787f..ef0e42d 100644 --- a/configs/magrpo_classeval_config.yaml +++ b/configs/magrpo_classeval_config.yaml @@ -20,9 +20,9 @@ dataset: eval_split: test[66:82] output: - base_dir: output + base_dir: output_magrpo_classeval save_final_model: false - save_path: output/final_model + save_path: output_magrpo_classeval verbose: false external: @@ -58,5 +58,5 @@ wandb: project: classeval_dev entity: null name: codecompletion_classeval_magrpo - dir: output + dir: output_magrpo_classeval tags: ["magrpo", "classeval", "code-completion", "turns_2"] From d644d06d8eea8a14bed527aeac3d03a2a4c36869 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 00:08:10 -0500 Subject: [PATCH 06/11] ud --- configs/iac_classeval_config.yaml | 2 +- configs/maac_classeval_config.yaml | 2 +- configs/magrpo_classeval_config.yaml | 2 +- train/train_iac.py | 2 +- train/train_maac.py | 2 +- utils/trainer_args.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/iac_classeval_config.yaml b/configs/iac_classeval_config.yaml index 44c5a86..0acaaeb 100644 --- a/configs/iac_classeval_config.yaml +++ b/configs/iac_classeval_config.yaml @@ -35,7 +35,7 @@ external: previous_response: true iac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 num_train_epochs: 40 diff --git a/configs/maac_classeval_config.yaml b/configs/maac_classeval_config.yaml index b53b78e..a099753 100644 --- a/configs/maac_classeval_config.yaml +++ b/configs/maac_classeval_config.yaml @@ -35,7 +35,7 @@ external: previous_response: true maac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 critic_type: v diff --git a/configs/magrpo_classeval_config.yaml b/configs/magrpo_classeval_config.yaml index ef0e42d..104e5a5 100644 --- a/configs/magrpo_classeval_config.yaml +++ b/configs/magrpo_classeval_config.yaml @@ -31,7 +31,7 @@ external: previous_response: true magrpo: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 num_train_epochs: 13 diff --git a/train/train_iac.py b/train/train_iac.py index 65f3371..9bd26fc 100644 --- a/train/train_iac.py +++ b/train/train_iac.py @@ -283,7 +283,7 @@ def _build_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IAC "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, - "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( diff --git a/train/train_maac.py b/train/train_maac.py index 7f85529..880fe86 100644 --- a/train/train_maac.py +++ b/train/train_maac.py @@ -285,7 +285,7 @@ def _build_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "discount": _as_float(tr.get("discount", 0.9), 0.9), - "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_type": critic_type, diff --git a/utils/trainer_args.py b/utils/trainer_args.py index b2dc0df..5c3b35b 100644 --- a/utils/trainer_args.py +++ b/utils/trainer_args.py @@ -164,7 +164,7 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA candidate.update( { "num_agents": _as_int(tr.get("num_agents", 1), 1), - "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": str(tr.get("joint_mode", "aligned")), From 07841cebf6572e065c89a3cdbe00ad425f83723f Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 10:10:14 -0500 Subject: [PATCH 07/11] ud --- configs/iac_classeval_config.yaml | 4 +++- configs/maac_classeval_config.yaml | 4 +++- configs/magrpo_classeval_config.yaml | 3 ++- train/train_iac.py | 2 +- train/train_maac.py | 2 +- utils/trainer_args.py | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/configs/iac_classeval_config.yaml b/configs/iac_classeval_config.yaml index 0acaaeb..680ce22 100644 --- a/configs/iac_classeval_config.yaml +++ b/configs/iac_classeval_config.yaml @@ -35,7 +35,9 @@ external: previous_response: true iac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 2 num_train_epochs: 40 diff --git a/configs/maac_classeval_config.yaml b/configs/maac_classeval_config.yaml index a099753..0c3b3f9 100644 --- a/configs/maac_classeval_config.yaml +++ b/configs/maac_classeval_config.yaml @@ -35,7 +35,9 @@ external: previous_response: true maac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 2 critic_type: v diff --git a/configs/magrpo_classeval_config.yaml b/configs/magrpo_classeval_config.yaml index 104e5a5..b19345d 100644 --- a/configs/magrpo_classeval_config.yaml +++ b/configs/magrpo_classeval_config.yaml @@ -31,7 +31,8 @@ external: previous_response: true magrpo: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] num_agents: 2 num_turns: 2 num_train_epochs: 13 diff --git a/train/train_iac.py b/train/train_iac.py index 9bd26fc..df3488b 100644 --- a/train/train_iac.py +++ b/train/train_iac.py @@ -283,7 +283,7 @@ def _build_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IAC "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( diff --git a/train/train_maac.py b/train/train_maac.py index 880fe86..7aefd7f 100644 --- a/train/train_maac.py +++ b/train/train_maac.py @@ -285,7 +285,7 @@ def _build_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "discount": _as_float(tr.get("discount", 0.9), 0.9), - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_type": critic_type, diff --git a/utils/trainer_args.py b/utils/trainer_args.py index 5c3b35b..fc373fe 100644 --- a/utils/trainer_args.py +++ b/utils/trainer_args.py @@ -164,7 +164,7 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA candidate.update( { "num_agents": _as_int(tr.get("num_agents", 1), 1), - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": str(tr.get("joint_mode", "aligned")), From d5e6d8082f60274b6896630a5bfe6d1056cb3fc9 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 11:01:12 -0500 Subject: [PATCH 08/11] ud --- train/train_iac.py | 16 ++++++++-------- train/train_maac.py | 18 +++++++++--------- train/train_magrpo.py | 4 ++-- utils/trainer_args.py | 35 +++++++++++++++++------------------ 4 files changed, 36 insertions(+), 37 deletions(-) diff --git a/train/train_iac.py b/train/train_iac.py index df3488b..3d4ba01 100644 --- a/train/train_iac.py +++ b/train/train_iac.py @@ -266,17 +266,17 @@ def _build_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IAC adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), + "num_turns": _as_int(tr.get("num_turns", 2), 2), "num_train_epochs": _as_int(tr.get("num_train_epochs", 40), 40), "agent_learning_rate": _as_float(tr.get("agent_learning_rate", 5e-6), 5e-6), "critic_learning_rate": _as_opt_float( tr.get("critic_learning_rate", 5e-6), 5e-6 ), - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 8), 8), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 2), 2), "value_loss_coef": _as_float(tr.get("value_loss_coef", 0.6), 0.6), "value_clip_range": _as_opt_float(tr.get("value_clip_range", 0.2), 0.2), "advantage_normalization": _as_bool(adv_norm, True), - "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), + "max_new_tokens": _as_int(tr.get("max_new_tokens", 600), 600), "temperature": sampling_cfg["temperature"], "top_p": sampling_cfg["top_p"], "top_k": sampling_cfg["top_k"], @@ -284,17 +284,17 @@ def _build_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IAC "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), - "critic_devices": _as_device_spec(tr.get("critic_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), + "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "critic_value_head_hidden_dim": _as_opt_int( tr.get("critic_value_head_hidden_dim", None), None ), "value_head_hidden_dim": _as_opt_int(tr.get("value_head_hidden_dim", None), None), "discount": _as_float(tr.get("discount", 0.9), 0.9), "early_termination_threshold": _as_opt_float( - tr.get("early_termination_threshold", None), None + tr.get("early_termination_threshold", -0.2), -0.2 ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), + "eval_interval": _as_int(tr.get("eval_interval", 20), 20), "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), "logging_steps": _as_int(tr.get("logging_steps", 1), 1), @@ -388,7 +388,7 @@ def main() -> int: if tmp_base: os.environ["CLASSEVAL_TMP_BASE"] = str(tmp_base) - model_name = str(model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B")).strip() + model_name = str(model_cfg.get("name", "Qwen/Qwen3-4B-Instruct-2507")).strip() agent_names = cfg.get("agents") model_kwargs: Dict[str, Any] = {} diff --git a/train/train_maac.py b/train/train_maac.py index 7aefd7f..6ff3946 100644 --- a/train/train_maac.py +++ b/train/train_maac.py @@ -269,16 +269,16 @@ def _build_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA critic_type = str(critic_type) candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), + "num_turns": _as_int(tr.get("num_turns", 2), 2), "num_train_epochs": _as_int(tr.get("num_train_epochs", 40), 40), "agent_learning_rate": _as_float(tr.get("agent_learning_rate", 5e-6), 5e-6), "critic_learning_rate": _as_float( tr.get("critic_learning_rate", 5e-6), 5e-6 ), - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 8), 8), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 2), 2), "value_loss_coef": _as_float(tr.get("value_loss_coef", 0.6), 0.6), "advantage_normalization": _as_bool(adv_norm, True), - "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), + "max_new_tokens": _as_int(tr.get("max_new_tokens", 600), 600), "temperature": sampling_cfg["temperature"], "top_p": sampling_cfg["top_p"], "top_k": sampling_cfg["top_k"], @@ -286,14 +286,14 @@ def _build_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA "num_generations": _as_int(tr.get("num_generations", 1), 1), "discount": _as_float(tr.get("discount", 0.9), 0.9), "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), - "critic_devices": _as_device_spec(tr.get("critic_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), + "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "critic_type": critic_type, "early_termination_threshold": _as_opt_float( - tr.get("early_termination_threshold", None), None + tr.get("early_termination_threshold", -0.2), -0.2 ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), - "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), + "eval_interval": _as_int(tr.get("eval_interval", 20), 20), + "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), "logging_steps": _as_int(tr.get("logging_steps", 1), 1), } @@ -388,7 +388,7 @@ def main() -> int: if tmp_base: os.environ["CLASSEVAL_TMP_BASE"] = str(tmp_base) - model_name = str(model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B")).strip() + model_name = str(model_cfg.get("name", "Qwen/Qwen3-4B-Instruct-2507")).strip() agent_names = cfg.get("agents") model_kwargs: Dict[str, Any] = {} diff --git a/train/train_magrpo.py b/train/train_magrpo.py index 707f328..3b88bdb 100644 --- a/train/train_magrpo.py +++ b/train/train_magrpo.py @@ -157,7 +157,7 @@ def main(): if isinstance(eval_split, str): eval_split = eval_split.strip() or None - num_agents = int(magrpo_cfg.get("num_agents", 1)) + num_agents = int(magrpo_cfg.get("num_agents", 2)) if not eval_split: print("dataset.eval_split is required.") @@ -194,7 +194,7 @@ def main(): tmp_base = None if tmp_base: os.environ["CLASSEVAL_TMP_BASE"] = str(tmp_base) - model_name = model_cfg.get("name", "Qwen/Qwen2.5-3B") + model_name = model_cfg.get("name", "Qwen/Qwen3-4B-Instruct-2507") agent_names = cfg.get("agents") if agent_names is not None: if not isinstance(agent_names, (list, tuple)) or not all( diff --git a/utils/trainer_args.py b/utils/trainer_args.py index fc373fe..4540d28 100644 --- a/utils/trainer_args.py +++ b/utils/trainer_args.py @@ -148,40 +148,39 @@ def _require_float(key: str) -> float: def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAGRPOConfig: tr = cfg.get("magrpo", {}) - lr_val = tr.get("agent_learning_rate", 3e-5) + lr_val = tr.get("agent_learning_rate", 1e-5) candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), - "num_train_epochs": _as_int(tr.get("num_train_epochs", 3), 3), - "agent_learning_rate": _as_float(lr_val, 3e-5), - "logging_steps": _as_int(tr.get("logging_steps", 50), 50), - "num_generations": _as_int(tr.get("num_generations", 4), 4), - "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), - "temperature": _as_float(sampling_cfg.get("temperature"), 0.2), - "top_p": _as_float(sampling_cfg.get("top_p"), 0.95), + "num_turns": _as_int(tr.get("num_turns", 2), 2), + "num_train_epochs": _as_int(tr.get("num_train_epochs", 13), 13), + "agent_learning_rate": _as_float(lr_val, 1e-5), + "logging_steps": _as_int(tr.get("logging_steps", 1), 1), + "num_generations": _as_int(tr.get("num_generations", 2), 2), + "max_new_tokens": _as_int(tr.get("max_new_tokens", 600), 600), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), } candidate.update( { - "num_agents": _as_int(tr.get("num_agents", 1), 1), + "num_agents": _as_int(tr.get("num_agents", 2), 2), "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": str(tr.get("joint_mode", "aligned")), + "early_termination_threshold": _as_opt_float( + tr.get("early_termination_threshold", -0.2), -0.2 + ), } ) - if "early_termination_threshold" in tr: - candidate["early_termination_threshold"] = _as_opt_float( - tr.get("early_termination_threshold", None), None - ) candidate.update( { - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 2), 2), - "train_batch_size": _as_opt_int(tr.get("train_batch_size", None), None), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 1), 1), + "train_batch_size": _as_opt_int(tr.get("train_batch_size", 1), 1), "advantage_normalization": _as_bool( tr.get("advantage_normalization", True), True ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), + "eval_interval": _as_int(tr.get("eval_interval", 10), 10), "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), "external_prompt_passthrough": True, From 548c2830031522ee0bec89edbc500441bbff6d7d Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 14:10:07 -0500 Subject: [PATCH 09/11] ud --- configs/iac_classeval_config.yaml | 1 + configs/maac_classeval_config.yaml | 1 + configs/magrpo_classeval_config.yaml | 1 + train/train_iac.py | 6 ++++++ train/train_maac.py | 6 ++++++ utils/trainer_args.py | 5 ++++- 6 files changed, 19 insertions(+), 1 deletion(-) diff --git a/configs/iac_classeval_config.yaml b/configs/iac_classeval_config.yaml index 680ce22..f815739 100644 --- a/configs/iac_classeval_config.yaml +++ b/configs/iac_classeval_config.yaml @@ -33,6 +33,7 @@ external: mode: code_feedback original_prompt: true previous_response: true + external_prompt_passthrough: false iac: parallel_training: none diff --git a/configs/maac_classeval_config.yaml b/configs/maac_classeval_config.yaml index 0c3b3f9..35ba958 100644 --- a/configs/maac_classeval_config.yaml +++ b/configs/maac_classeval_config.yaml @@ -33,6 +33,7 @@ external: mode: code_feedback original_prompt: true previous_response: true + external_prompt_passthrough: false maac: parallel_training: none diff --git a/configs/magrpo_classeval_config.yaml b/configs/magrpo_classeval_config.yaml index b19345d..1f2b511 100644 --- a/configs/magrpo_classeval_config.yaml +++ b/configs/magrpo_classeval_config.yaml @@ -29,6 +29,7 @@ external: mode: code_feedback original_prompt: true previous_response: true + external_prompt_passthrough: false magrpo: parallel_training: none diff --git a/train/train_iac.py b/train/train_iac.py index 3d4ba01..b58ec90 100644 --- a/train/train_iac.py +++ b/train/train_iac.py @@ -261,6 +261,9 @@ def _build_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IAC tr = cfg.get("iac") or {} if not isinstance(tr, dict): tr = {} + ext = cfg.get("external") or {} + if not isinstance(ext, dict): + ext = {} use_separate_critic = _as_bool(tr.get("use_separate_critic", True), True) adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) @@ -291,6 +294,9 @@ def _build_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IAC ), "value_head_hidden_dim": _as_opt_int(tr.get("value_head_hidden_dim", None), None), "discount": _as_float(tr.get("discount", 0.9), 0.9), + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), "early_termination_threshold": _as_opt_float( tr.get("early_termination_threshold", -0.2), -0.2 ), diff --git a/train/train_maac.py b/train/train_maac.py index 6ff3946..5dc429c 100644 --- a/train/train_maac.py +++ b/train/train_maac.py @@ -261,6 +261,9 @@ def _build_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA tr = cfg.get("maac") or {} if not isinstance(tr, dict): tr = {} + ext = cfg.get("external") or {} + if not isinstance(ext, dict): + ext = {} output_cfg = cfg.get("output", {}) or {} adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) @@ -289,6 +292,9 @@ def _build_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "critic_type": critic_type, + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), "early_termination_threshold": _as_opt_float( tr.get("early_termination_threshold", -0.2), -0.2 ), diff --git a/utils/trainer_args.py b/utils/trainer_args.py index 4540d28..48ea230 100644 --- a/utils/trainer_args.py +++ b/utils/trainer_args.py @@ -148,6 +148,7 @@ def _require_float(key: str) -> float: def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAGRPOConfig: tr = cfg.get("magrpo", {}) + ext = cfg.get("external", {}) lr_val = tr.get("agent_learning_rate", 1e-5) candidate = { @@ -183,7 +184,9 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA "eval_interval": _as_int(tr.get("eval_interval", 10), 10), "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), - "external_prompt_passthrough": True, + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), } ) From bcf2167d565197937908345d398c68f2c958ace4 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 17:01:35 -0500 Subject: [PATCH 10/11] ud --- configs/iac_classeval_config.yaml | 20 +++++++++++++------- configs/maac_classeval_config.yaml | 18 ++++++++++++------ configs/magrpo_classeval_config.yaml | 11 ++++++++--- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/configs/iac_classeval_config.yaml b/configs/iac_classeval_config.yaml index f815739..01f7881 100644 --- a/configs/iac_classeval_config.yaml +++ b/configs/iac_classeval_config.yaml @@ -10,7 +10,7 @@ agent_model: agents: null critic_model: - name: "Qwen/Qwen3-4B-Instruct-2507" + name: Qwen/Qwen3-4B-Instruct-2507 type: qwen max_length: 2048 torch_dtype: bfloat16 @@ -25,9 +25,9 @@ dataset: output: base_dir: output_iac_classeval + verbose: false save_final_model: false save_path: output_iac_classeval - verbose: false external: mode: code_feedback @@ -37,10 +37,13 @@ external: iac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 2 + use_separate_critic: true num_train_epochs: 40 agent_learning_rate: 5e-6 critic_learning_rate: 5e-6 @@ -49,10 +52,9 @@ iac: rollout_buffer_size: 2 train_batch_size: 2 max_new_tokens: 600 - num_generations: 1 - use_separate_critic: true discount: 0.9 early_termination_threshold: -0.2 + num_generations: 1 eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 @@ -68,4 +70,8 @@ wandb: entity: null name: codecompletion_classeval_iac dir: output_iac_classeval - tags: ["iac", "classeval", "code-completion", "turns_2"] + tags: + - iac + - classeval + - code-completion + - turns_2 diff --git a/configs/maac_classeval_config.yaml b/configs/maac_classeval_config.yaml index 35ba958..d15ddbc 100644 --- a/configs/maac_classeval_config.yaml +++ b/configs/maac_classeval_config.yaml @@ -10,7 +10,7 @@ agent_model: agents: null critic_model: - name: "Qwen/Qwen3-4B-Instruct-2507" + name: Qwen/Qwen3-4B-Instruct-2507 type: qwen max_length: 2048 torch_dtype: bfloat16 @@ -25,9 +25,9 @@ dataset: output: base_dir: output_maac_classeval + verbose: false save_final_model: false save_path: output_maac_classeval - verbose: false external: mode: code_feedback @@ -37,8 +37,10 @@ external: maac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 2 critic_type: v @@ -49,9 +51,9 @@ maac: rollout_buffer_size: 2 train_batch_size: 2 max_new_tokens: 600 - num_generations: 1 discount: 0.9 early_termination_threshold: -0.2 + num_generations: 1 eval_interval: 20 eval_num_samples: 2 eval_batch_size: 1 @@ -67,4 +69,8 @@ wandb: entity: null name: codecompletion_classeval_maac dir: output_maac_classeval - tags: ["maac", "classeval", "code-completion", "turns_2"] + tags: + - maac + - classeval + - code-completion + - turns_2 diff --git a/configs/magrpo_classeval_config.yaml b/configs/magrpo_classeval_config.yaml index 1f2b511..051d0bd 100644 --- a/configs/magrpo_classeval_config.yaml +++ b/configs/magrpo_classeval_config.yaml @@ -21,9 +21,9 @@ dataset: output: base_dir: output_magrpo_classeval + verbose: false save_final_model: false save_path: output_magrpo_classeval - verbose: false external: mode: code_feedback @@ -33,7 +33,8 @@ external: magrpo: parallel_training: none - agent_devices: ["cuda:0"] + agent_devices: + - cuda:0 num_agents: 2 num_turns: 2 num_train_epochs: 13 @@ -61,4 +62,8 @@ wandb: entity: null name: codecompletion_classeval_magrpo dir: output_magrpo_classeval - tags: ["magrpo", "classeval", "code-completion", "turns_2"] + tags: + - magrpo + - classeval + - code-completion + - turns_2 From a6a5ccad31b7db41f7cffc1cc0caaa519288090e Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 17:09:25 -0500 Subject: [PATCH 11/11] ud --- train/train_magrpo.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/train/train_magrpo.py b/train/train_magrpo.py index 3b88bdb..1b8e10b 100644 --- a/train/train_magrpo.py +++ b/train/train_magrpo.py @@ -23,7 +23,7 @@ sys.path.insert(0, COMLRL_ROOT) from datasets import load_dataset # type: ignore -from transformers import AutoModelForCausalLM, AutoTokenizer # type: ignore +from transformers import AutoTokenizer # type: ignore import torch # type: ignore from comlrl.trainers.reinforce import MAGRPOTrainer # type: ignore @@ -202,8 +202,6 @@ def main(): ): raise ValueError("agents must be a list of model names.") agent_names = [str(x) for x in agent_names] - model_kwargs: Dict[str, Any] = {} - dtype_cfg = ( model_cfg.get("dtype") or model_cfg.get("torch_dtype") @@ -233,9 +231,6 @@ def _map_dtype(x): except Exception: torch_dtype = None - if torch_dtype is not None: - model_kwargs["torch_dtype"] = torch_dtype - tokenizer_source = agent_names[0] if agent_names else model_name if not tokenizer_source: raise ValueError("agent_model.name or agents must be provided.") @@ -248,16 +243,6 @@ def _map_dtype(x): tok.pad_token = tok.eos_token tokenizer = tokenizers[0] - agents = [] - if agent_names: - for name in agent_names: - agent = AutoModelForCausalLM.from_pretrained(name, **model_kwargs) - agents.append(agent) - else: - for _ in range(num_agents): - agent = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs) - agents.append(agent) - strategy = get_strategy(num_agents=num_agents, seed=seed) sampling_cfg = get_agent_sampling_config(cfg) @@ -342,8 +327,12 @@ def _map_dtype(x): trainer_kwargs = { "agent_model": model_name or None, - "agents": agents, + "agents": agent_names, "num_agents": num_agents, + "model_config": { + "torch_dtype": torch_dtype, + "special_tokens": model_cfg.get("special_tokens", {}), + }, "reward_func": reward_func, "formatters": formatters, "args": magrpo_args,