From 70e5cf68e449674949fd4603b1058469c133420e Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 11:27:02 -0500 Subject: [PATCH 01/14] Resolve CoMLRL from local workspace --- config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/config.py b/config.py index 8a65304..60d21c7 100644 --- a/config.py +++ b/config.py @@ -4,6 +4,8 @@ """ import argparse +import os +import sys from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, Optional @@ -11,6 +13,12 @@ import yaml +REPO_ROOT = os.path.dirname(os.path.abspath(__file__)) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) + + @dataclass(frozen=True) class ModelConfig: """Configuration for model loading and generation.""" From e2691ac2405638625c3d3e54f931d77b0fa8fcf5 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 13:39:20 -0500 Subject: [PATCH 02/14] wire parallel mode and device assignment options into trainer configs --- train_ac.py | 3 +++ train_grpo.py | 2 ++ train_iac.py | 3 +++ train_maac.py | 3 +++ train_magrpo.py | 4 ++++ 5 files changed, 15 insertions(+) diff --git a/train_ac.py b/train_ac.py index 720fd51..31cfebf 100644 --- a/train_ac.py +++ b/train_ac.py @@ -384,6 +384,9 @@ def external_transition_fn( num_agents=1, num_generations=ac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, + parallel_mode=str(ac_cfg.get("parallel_mode", "auto")).strip().lower(), + agent_devices=ac_cfg.get("agent_devices", None), + critic_devices=ac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=ac_cfg.get("critic_value_head_hidden_dim"), value_head_hidden_dim=ac_cfg.get("value_head_hidden_dim"), discount=ac_cfg.get("discount", 0.9), diff --git a/train_grpo.py b/train_grpo.py index c4a8359..fcf019b 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -337,6 +337,8 @@ def _resolver(prompt: str): temperature=temperature, top_p=top_p, top_k=top_k, + parallel_mode=str(grpo_config.get("parallel_mode", "auto")).strip().lower(), + agent_devices=grpo_config.get("agent_devices", None), discount=grpo_config.get("discount", 0.9), joint_mode=grpo_config.get("joint_mode", "aligned"), early_termination_threshold=grpo_config.get( diff --git a/train_iac.py b/train_iac.py index 7393d53..fa54ab9 100644 --- a/train_iac.py +++ b/train_iac.py @@ -417,6 +417,9 @@ def external_transition_fn( num_agents=num_agents, num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, + parallel_mode=str(iac_cfg.get("parallel_mode", "auto")).strip().lower(), + agent_devices=iac_cfg.get("agent_devices", None), + critic_devices=iac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), value_head_hidden_dim=iac_cfg.get("value_head_hidden_dim"), discount=iac_cfg.get("discount", 0.9), diff --git a/train_maac.py b/train_maac.py index d0a8683..e613da6 100644 --- a/train_maac.py +++ b/train_maac.py @@ -411,6 +411,9 @@ def external_transition_fn( top_k=top_k, num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), + parallel_mode=str(maac_cfg.get("parallel_mode", "auto")).strip().lower(), + agent_devices=maac_cfg.get("agent_devices", None), + critic_devices=maac_cfg.get("critic_devices", None), discount=discount, critic_type=maac_cfg.get("critic_type", "v"), early_termination_threshold=maac_cfg.get( diff --git a/train_magrpo.py b/train_magrpo.py index ce77160..d24f187 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -407,6 +407,10 @@ def _resolver(prompt: str): magrpo_args_kwargs.update( { "num_agents": num_agents, + "parallel_mode": str( + magrpo_config.get("parallel_mode", "auto") + ).strip().lower(), + "agent_devices": magrpo_config.get("agent_devices", None), "discount": magrpo_config.get("discount", 0.9), "joint_mode": magrpo_config.get("joint_mode", "aligned"), "early_termination_threshold": magrpo_config.get( From 09408c82c58b845d2023611bcedfead9e516b770 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 14:02:30 -0500 Subject: [PATCH 03/14] switch to parallel_training key and add auto defaults in yaml --- configs/ac_che_config.yaml | 1 + configs/ac_he_config.yaml | 1 + configs/ac_mbpp_config.yaml | 1 + configs/grpo_che_config.yaml | 1 + configs/grpo_he_config.yaml | 1 + configs/grpo_mbpp_config.yaml | 1 + configs/iac_che_config.yaml | 1 + configs/iac_he_config.yaml | 1 + configs/iac_mbpp_config.yaml | 1 + configs/maac_che_config.yaml | 1 + configs/maac_he_config.yaml | 1 + configs/maac_mbpp_config.yaml | 1 + configs/magrpo_che_config.yaml | 1 + configs/magrpo_he_config.yaml | 1 + configs/magrpo_mbpp_config.yaml | 1 + train_ac.py | 2 +- train_grpo.py | 2 +- train_iac.py | 2 +- train_maac.py | 2 +- train_magrpo.py | 4 ++-- 20 files changed, 21 insertions(+), 6 deletions(-) diff --git a/configs/ac_che_config.yaml b/configs/ac_che_config.yaml index 507f846..7b5c632 100644 --- a/configs/ac_che_config.yaml +++ b/configs/ac_che_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 ac: + parallel_training: auto num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/ac_he_config.yaml b/configs/ac_he_config.yaml index cfa73a3..6fb8217 100644 --- a/configs/ac_he_config.yaml +++ b/configs/ac_he_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 ac: + parallel_training: auto num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/ac_mbpp_config.yaml b/configs/ac_mbpp_config.yaml index 4a502f4..7336ae9 100644 --- a/configs/ac_mbpp_config.yaml +++ b/configs/ac_mbpp_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 ac: + parallel_training: auto num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 8eae75e..0ab8392 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -28,6 +28,7 @@ external: sandbox_slice: 1 grpo: + parallel_training: auto num_turns: 2 num_train_epochs: 20 agent_learning_rate: 2.0e-5 diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 9d9d131..d957c1e 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -28,6 +28,7 @@ external: sandbox_slice: 1 grpo: + parallel_training: auto num_turns: 2 num_train_epochs: 6 agent_learning_rate: 2.0e-5 diff --git a/configs/grpo_mbpp_config.yaml b/configs/grpo_mbpp_config.yaml index c850a52..58d4429 100644 --- a/configs/grpo_mbpp_config.yaml +++ b/configs/grpo_mbpp_config.yaml @@ -28,6 +28,7 @@ external: sandbox_slice: 1 grpo: + parallel_training: auto num_turns: 2 num_train_epochs: 8 agent_learning_rate: 3.0e-5 diff --git a/configs/iac_che_config.yaml b/configs/iac_che_config.yaml index 0f2c3e7..06da93e 100644 --- a/configs/iac_che_config.yaml +++ b/configs/iac_che_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 iac: + parallel_training: auto num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/iac_he_config.yaml b/configs/iac_he_config.yaml index 59d34f0..c23ad80 100644 --- a/configs/iac_he_config.yaml +++ b/configs/iac_he_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 iac: + parallel_training: auto num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/iac_mbpp_config.yaml b/configs/iac_mbpp_config.yaml index b4e0400..c8bb51d 100644 --- a/configs/iac_mbpp_config.yaml +++ b/configs/iac_mbpp_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 iac: + parallel_training: auto num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/maac_che_config.yaml b/configs/maac_che_config.yaml index ee1e828..8f8a4d5 100644 --- a/configs/maac_che_config.yaml +++ b/configs/maac_che_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 maac: + parallel_training: auto num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/maac_he_config.yaml b/configs/maac_he_config.yaml index 704c03d..0b133ec 100644 --- a/configs/maac_he_config.yaml +++ b/configs/maac_he_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 maac: + parallel_training: auto num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/maac_mbpp_config.yaml b/configs/maac_mbpp_config.yaml index f038a89..fedc6a8 100644 --- a/configs/maac_mbpp_config.yaml +++ b/configs/maac_mbpp_config.yaml @@ -35,6 +35,7 @@ external: sandbox_slice: 1 maac: + parallel_training: auto num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 5ad628e..92f9b46 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -31,6 +31,7 @@ external: sandbox_slice: 1 magrpo: + parallel_training: auto num_agents: 2 num_turns: 2 num_train_epochs: 8 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index 2155bca..2d20289 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -31,6 +31,7 @@ external: sandbox_slice: 1 magrpo: + parallel_training: auto num_agents: 2 num_turns: 2 num_train_epochs: 6 diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index abb4b12..aa3e5d4 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -31,6 +31,7 @@ external: sandbox_slice: 1 magrpo: + parallel_training: auto num_agents: 2 num_turns: 2 num_train_epochs: 8 diff --git a/train_ac.py b/train_ac.py index 31cfebf..3b24145 100644 --- a/train_ac.py +++ b/train_ac.py @@ -384,7 +384,7 @@ def external_transition_fn( num_agents=1, num_generations=ac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_mode=str(ac_cfg.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(ac_cfg.get("parallel_training", "auto")).strip().lower(), agent_devices=ac_cfg.get("agent_devices", None), critic_devices=ac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=ac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_grpo.py b/train_grpo.py index fcf019b..76610c3 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -337,7 +337,7 @@ def _resolver(prompt: str): temperature=temperature, top_p=top_p, top_k=top_k, - parallel_mode=str(grpo_config.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(grpo_config.get("parallel_training", "auto")).strip().lower(), agent_devices=grpo_config.get("agent_devices", None), discount=grpo_config.get("discount", 0.9), joint_mode=grpo_config.get("joint_mode", "aligned"), diff --git a/train_iac.py b/train_iac.py index fa54ab9..05e81ad 100644 --- a/train_iac.py +++ b/train_iac.py @@ -417,7 +417,7 @@ def external_transition_fn( num_agents=num_agents, num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_mode=str(iac_cfg.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(iac_cfg.get("parallel_training", "auto")).strip().lower(), agent_devices=iac_cfg.get("agent_devices", None), critic_devices=iac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_maac.py b/train_maac.py index e613da6..fc59b87 100644 --- a/train_maac.py +++ b/train_maac.py @@ -411,7 +411,7 @@ def external_transition_fn( top_k=top_k, num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), - parallel_mode=str(maac_cfg.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(maac_cfg.get("parallel_training", "auto")).strip().lower(), agent_devices=maac_cfg.get("agent_devices", None), critic_devices=maac_cfg.get("critic_devices", None), discount=discount, diff --git a/train_magrpo.py b/train_magrpo.py index d24f187..530c984 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -407,8 +407,8 @@ def _resolver(prompt: str): magrpo_args_kwargs.update( { "num_agents": num_agents, - "parallel_mode": str( - magrpo_config.get("parallel_mode", "auto") + "parallel_training": str( + magrpo_config.get("parallel_training", "auto") ).strip().lower(), "agent_devices": magrpo_config.get("agent_devices", None), "discount": magrpo_config.get("discount", 0.9), From 162a9c50f91b03e5f3b546f3b3ec0fc18d5f5bf1 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 15:57:08 -0500 Subject: [PATCH 04/14] ud --- config.py | 56 ++++++++++++++++++++++++++++----- configs/ac_che_config.yaml | 6 +--- configs/ac_he_config.yaml | 6 +--- configs/ac_mbpp_config.yaml | 6 +--- configs/grpo_che_config.yaml | 4 +-- configs/grpo_he_config.yaml | 4 +-- configs/grpo_mbpp_config.yaml | 4 +-- configs/iac_che_config.yaml | 6 +--- configs/iac_he_config.yaml | 6 +--- configs/iac_mbpp_config.yaml | 6 +--- configs/maac_che_config.yaml | 6 +--- configs/maac_he_config.yaml | 6 +--- configs/maac_mbpp_config.yaml | 6 +--- configs/magrpo_che_config.yaml | 4 +-- configs/magrpo_he_config.yaml | 4 +-- configs/magrpo_mbpp_config.yaml | 4 +-- train_ac.py | 15 +++++---- train_grpo.py | 6 ++-- train_iac.py | 15 +++++---- train_maac.py | 15 +++++---- train_magrpo.py | 8 ++--- 21 files changed, 98 insertions(+), 95 deletions(-) diff --git a/config.py b/config.py index 60d21c7..ebfb5a5 100644 --- a/config.py +++ b/config.py @@ -25,20 +25,62 @@ class ModelConfig: name: str type: str = "qwen" - temperature: float = 0.7 - top_p: float = 0.9 + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None max_length: int = 2048 special_tokens: Dict[str, str] = field(default_factory=dict) torch_dtype: Optional[str] = None @classmethod - def from_dict(cls, config_dict: Dict[str, Any]) -> "ModelConfig": + def from_dict( + cls, + config_dict: Dict[str, Any], + *, + require_sampling: bool = True, + ) -> "ModelConfig": """Create ModelConfig from dictionary.""" + if require_sampling: + missing = [ + key + for key in ("temperature", "top_p", "top_k") + if key not in config_dict + ] + if missing: + raise ValueError( + f"agent_model is missing required sampling fields: {', '.join(missing)}" + ) + + def _as_optional_float(value: Any) -> Optional[float]: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError) as exc: + raise ValueError(f"Invalid float value: {value}") from exc + + def _as_optional_int(value: Any) -> Optional[int]: + if value is None: + return None + if isinstance(value, str) and value.strip().lower() in ("none", "null", ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError) as exc: + raise ValueError(f"Invalid int value: {value}") from exc + + temperature = _as_optional_float(config_dict.get("temperature")) + top_p = _as_optional_float(config_dict.get("top_p")) + top_k = _as_optional_int(config_dict.get("top_k")) + if require_sampling and (temperature is None or top_p is None): + raise ValueError("agent_model.temperature and agent_model.top_p must be non-null.") + return cls( name=config_dict.get("name", ""), type=config_dict.get("type", "qwen"), - temperature=config_dict.get("temperature", 0.7), - top_p=config_dict.get("top_p", 0.9), + temperature=temperature, + top_p=top_p, + top_k=top_k, max_length=config_dict.get("max_length", 2048), special_tokens=config_dict.get("special_tokens", {}), torch_dtype=( @@ -82,7 +124,7 @@ def get_agent_model_config(self) -> ModelConfig: model_section = self.get_section("agent_model") if not model_section: raise ValueError("No 'agent_model' section found in configuration") - return ModelConfig.from_dict(model_section) + return ModelConfig.from_dict(model_section, require_sampling=True) def get_critic_model_config(self, required: bool = True) -> Optional[ModelConfig]: """Get critic model configuration as ModelConfig object.""" @@ -91,7 +133,7 @@ def get_critic_model_config(self, required: bool = True) -> Optional[ModelConfig if required: raise ValueError("No 'critic_model' section found in configuration") return None - return ModelConfig.from_dict(critic_section) + return ModelConfig.from_dict(critic_section, require_sampling=False) def update(self, updates: Dict[str, Any]): """Update configuration with new values (deep merge).""" diff --git a/configs/ac_che_config.yaml b/configs/ac_che_config.yaml index 7b5c632..a92843d 100644 --- a/configs/ac_che_config.yaml +++ b/configs/ac_che_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -44,9 +43,6 @@ ac: advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/ac_he_config.yaml b/configs/ac_he_config.yaml index 6fb8217..9766de1 100644 --- a/configs/ac_he_config.yaml +++ b/configs/ac_he_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -44,9 +43,6 @@ ac: advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/ac_mbpp_config.yaml b/configs/ac_mbpp_config.yaml index 7336ae9..71b941c 100644 --- a/configs/ac_mbpp_config.yaml +++ b/configs/ac_mbpp_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -44,9 +43,6 @@ ac: advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 0ab8392..19817c9 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -35,9 +36,6 @@ grpo: logging_steps: 50 num_generations: 4 max_new_tokens: 256 - temperature: 0.8 - top_p: 0.95 - top_k: null discount: 0.9 early_termination_threshold: -0.1 rollout_buffer_size: 2 diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index d957c1e..7236796 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -35,9 +36,6 @@ grpo: logging_steps: 50 num_generations: 4 max_new_tokens: 256 - temperature: 0.8 - top_p: 0.95 - top_k: null discount: 0.9 early_termination_threshold: -0.1 rollout_buffer_size: 2 diff --git a/configs/grpo_mbpp_config.yaml b/configs/grpo_mbpp_config.yaml index 58d4429..bcf002f 100644 --- a/configs/grpo_mbpp_config.yaml +++ b/configs/grpo_mbpp_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -35,9 +36,6 @@ grpo: logging_steps: 50 num_generations: 4 max_new_tokens: 256 - temperature: 0.8 - top_p: 0.95 - top_k: null discount: 0.9 early_termination_threshold: -0.1 rollout_buffer_size: 2 diff --git a/configs/iac_che_config.yaml b/configs/iac_che_config.yaml index 06da93e..5646f2d 100644 --- a/configs/iac_che_config.yaml +++ b/configs/iac_che_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -47,9 +46,6 @@ iac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/iac_he_config.yaml b/configs/iac_he_config.yaml index c23ad80..2582103 100644 --- a/configs/iac_he_config.yaml +++ b/configs/iac_he_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -47,9 +46,6 @@ iac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/iac_mbpp_config.yaml b/configs/iac_mbpp_config.yaml index c8bb51d..b39f84d 100644 --- a/configs/iac_mbpp_config.yaml +++ b/configs/iac_mbpp_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -47,9 +46,6 @@ iac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/maac_che_config.yaml b/configs/maac_che_config.yaml index 8f8a4d5..91becf7 100644 --- a/configs/maac_che_config.yaml +++ b/configs/maac_che_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -46,9 +45,6 @@ maac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/maac_he_config.yaml b/configs/maac_he_config.yaml index 0b133ec..07f0986 100644 --- a/configs/maac_he_config.yaml +++ b/configs/maac_he_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -46,9 +45,6 @@ maac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/maac_mbpp_config.yaml b/configs/maac_mbpp_config.yaml index fedc6a8..076d8e4 100644 --- a/configs/maac_mbpp_config.yaml +++ b/configs/maac_mbpp_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "bfloat16" @@ -46,9 +45,6 @@ maac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.2 eval_interval: 40 diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 92f9b46..658a718 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" special_tokens: {} @@ -39,9 +40,6 @@ magrpo: logging_steps: 50 num_generations: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 joint_mode: aligned early_termination_threshold: -0.2 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index 2d20289..34bfb37 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" special_tokens: {} @@ -39,9 +40,6 @@ magrpo: logging_steps: 50 num_generations: 4 max_new_tokens: 256 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 joint_mode: aligned early_termination_threshold: -0.2 diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index aa3e5d4..09e9991 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "bfloat16" special_tokens: {} @@ -39,9 +40,6 @@ magrpo: logging_steps: 50 num_generations: 4 max_new_tokens: 256 - temperature: 0.8 - top_p: 0.95 - top_k: null discount: 0.9 joint_mode: aligned early_termination_threshold: -0.2 diff --git a/train_ac.py b/train_ac.py index 3b24145..9907d95 100644 --- a/train_ac.py +++ b/train_ac.py @@ -311,9 +311,9 @@ def _resolver(prompt: str): reward_processor = RewardProcessors.shift(value=shift_val_f) # AC-specific config - top_k = ac_cfg.get("top_k") - temperature = ac_cfg.get("temperature", 0.6) - top_p = ac_cfg.get("top_p", 0.6) + top_k = model_config.top_k + temperature = model_config.temperature + top_p = model_config.top_p use_separate_critic = bool(ac_cfg.get("use_separate_critic", True)) model_kwargs: Dict[str, Any] = {} if model_config.torch_dtype is not None: @@ -443,6 +443,9 @@ def _build_wandb_config( ): wandb_section = config.get_section("wandb") if hasattr(config, "get_section") else {} ac_section = config.get_section("ac") if hasattr(config, "get_section") else {} + model_section = ( + config.get_section("agent_model") if hasattr(config, "get_section") else {} + ) output_section = ( config.get_section("output") if hasattr(config, "get_section") else {} ) @@ -470,9 +473,9 @@ def _build_wandb_config( "trainer": { "num_turns": ac_section.get("num_turns", 1), "max_new_tokens": ac_section.get("max_new_tokens", 256), - "temperature": ac_section.get("temperature", 0.6), - "top_p": ac_section.get("top_p", 0.6), - "top_k": ac_section.get("top_k"), + "temperature": model_section.get("temperature"), + "top_p": model_section.get("top_p"), + "top_k": model_section.get("top_k"), "discount": ac_section.get("discount", 0.9), "use_separate_critic": ac_section.get("use_separate_critic", True), }, diff --git a/train_grpo.py b/train_grpo.py index 76610c3..650ec1e 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -242,9 +242,9 @@ def main(): ) print("Model loaded successfully!") - temperature = grpo_config.get("temperature", model_config.temperature) - top_p = grpo_config.get("top_p", model_config.top_p) - top_k = grpo_config.get("top_k") + temperature = model_config.temperature + top_p = model_config.top_p + top_k = model_config.top_k external_cfg = config.get_section("external") if hasattr(config, "get_section") else {} # Register external context resolver using dataset items (for external modes) diff --git a/train_iac.py b/train_iac.py index 05e81ad..eebf7b7 100644 --- a/train_iac.py +++ b/train_iac.py @@ -352,9 +352,9 @@ def _resolver(prompt: str): if shift_val_f is not None: reward_processor = RewardProcessors.shift(value=shift_val_f) - top_k = iac_cfg.get("top_k") - temperature = iac_cfg.get("temperature", 0.6) - top_p = iac_cfg.get("top_p", 0.6) + top_k = model_config.top_k + temperature = model_config.temperature + top_p = model_config.top_p use_separate_critic = bool(iac_cfg.get("use_separate_critic", True)) model_kwargs: Dict[str, Any] = {} if model_config.torch_dtype is not None: @@ -477,6 +477,9 @@ def _build_wandb_config( ): wandb_section = config.get_section("wandb") if hasattr(config, "get_section") else {} iac_section = config.get_section("iac") if hasattr(config, "get_section") else {} + model_section = ( + config.get_section("agent_model") if hasattr(config, "get_section") else {} + ) output_section = ( config.get_section("output") if hasattr(config, "get_section") else {} ) @@ -504,9 +507,9 @@ def _build_wandb_config( "trainer": { "num_turns": iac_section.get("num_turns", 1), "max_new_tokens": iac_section.get("max_new_tokens", 256), - "temperature": iac_section.get("temperature", 0.6), - "top_p": iac_section.get("top_p", 0.6), - "top_k": iac_section.get("top_k"), + "temperature": model_section.get("temperature"), + "top_p": model_section.get("top_p"), + "top_k": model_section.get("top_k"), "use_separate_critic": iac_section.get( "use_separate_critic", False ), diff --git a/train_maac.py b/train_maac.py index fc59b87..89e2306 100644 --- a/train_maac.py +++ b/train_maac.py @@ -342,9 +342,9 @@ def _resolver(prompt: str): if shift_val_f is not None: reward_processor = RewardProcessors.shift(value=shift_val_f) - top_k = maac_cfg.get("top_k") - temperature = maac_cfg.get("temperature", 0.6) - top_p = maac_cfg.get("top_p", 0.6) + top_k = model_config.top_k + temperature = model_config.temperature + top_p = model_config.top_p model_kwargs: Dict[str, Any] = {} if model_config.torch_dtype is not None: model_kwargs["torch_dtype"] = model_config.torch_dtype @@ -466,6 +466,9 @@ def _build_wandb_config( ): wandb_section = config.get_section("wandb") if hasattr(config, "get_section") else {} maac_section = config.get_section("maac") if hasattr(config, "get_section") else {} + model_section = ( + config.get_section("agent_model") if hasattr(config, "get_section") else {} + ) output_section = ( config.get_section("output") if hasattr(config, "get_section") else {} ) @@ -493,9 +496,9 @@ def _build_wandb_config( "trainer": { "num_turns": maac_section.get("num_turns", 2), "max_new_tokens": maac_section.get("max_new_tokens", 256), - "temperature": maac_section.get("temperature", 0.6), - "top_p": maac_section.get("top_p", 0.6), - "top_k": maac_section.get("top_k"), + "temperature": model_section.get("temperature"), + "top_p": model_section.get("top_p"), + "top_k": model_section.get("top_k"), "discount": maac_section.get("discount", 0.9), "critic_type": maac_section.get("critic_type", "v"), }, diff --git a/train_magrpo.py b/train_magrpo.py index 530c984..3191fcd 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -300,8 +300,9 @@ def main(): ) tokenizer = tokenizers[0] - temperature = magrpo_config.get("temperature", 0.6) - top_p = magrpo_config.get("top_p", 0.6) + temperature = model_config.temperature + top_p = model_config.top_p + top_k = model_config.top_k external_cfg = config.get_section("external") if hasattr(config, "get_section") else {} # Register external context resolver using dataset items @@ -401,9 +402,8 @@ def _resolver(prompt: str): "max_new_tokens": magrpo_config.get("max_new_tokens", 256), "temperature": temperature, "top_p": top_p, + "top_k": top_k, } - if "top_k" in magrpo_config: - magrpo_args_kwargs["top_k"] = magrpo_config.get("top_k") magrpo_args_kwargs.update( { "num_agents": num_agents, From 76c7183880eaa376976373c770323f91c6f656ba Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 16:33:47 -0500 Subject: [PATCH 05/14] ud --- configs/ac_che_config.yaml | 6 +++--- configs/ac_he_config.yaml | 6 +++--- configs/ac_mbpp_config.yaml | 6 +++--- configs/grpo_che_config.yaml | 4 ++-- configs/grpo_he_config.yaml | 4 ++-- configs/grpo_mbpp_config.yaml | 4 ++-- configs/iac_che_config.yaml | 6 +++--- configs/iac_he_config.yaml | 6 +++--- configs/iac_mbpp_config.yaml | 6 +++--- configs/maac_che_config.yaml | 6 +++--- configs/maac_he_config.yaml | 6 +++--- configs/maac_mbpp_config.yaml | 6 +++--- configs/magrpo_che_config.yaml | 4 ++-- configs/magrpo_he_config.yaml | 4 ++-- configs/magrpo_mbpp_config.yaml | 4 ++-- 15 files changed, 39 insertions(+), 39 deletions(-) diff --git a/configs/ac_che_config.yaml b/configs/ac_che_config.yaml index a92843d..573885a 100644 --- a/configs/ac_che_config.yaml +++ b/configs/ac_che_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "train[:16]" output: - base_dir: "output" + base_dir: output_ac_che verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_ac_che external: mode: "level_feedback" @@ -54,5 +54,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "ac_coophumaneval" - dir: "output" + dir: output_ac_che tags: ["ac", "coophumaneval", "single-agent", "turns_2"] diff --git a/configs/ac_he_config.yaml b/configs/ac_he_config.yaml index 9766de1..ffec31e 100644 --- a/configs/ac_he_config.yaml +++ b/configs/ac_he_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "test[:32]" output: - base_dir: "output" + base_dir: output_ac_he verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_ac_he external: mode: "level_feedback" @@ -54,5 +54,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "ac_humaneval" - dir: "output" + dir: output_ac_he tags: ["ac", "humaneval", "single-agent", "turns_2"] diff --git a/configs/ac_mbpp_config.yaml b/configs/ac_mbpp_config.yaml index 71b941c..f76dd7f 100644 --- a/configs/ac_mbpp_config.yaml +++ b/configs/ac_mbpp_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "test[:15]" output: - base_dir: "output" + base_dir: output_ac_mbpp verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_ac_mbpp external: mode: "level_feedback" @@ -54,5 +54,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "ac_mbpp" - dir: "output" + dir: output_ac_mbpp tags: ["ac", "mbpp", "single-agent", "turns_2"] diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 19817c9..ee346bb 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -20,7 +20,7 @@ dataset: eval_split: "train[:16]" output: - base_dir: "output" + base_dir: output_grpo_che save_final_model: false verbose: false @@ -51,5 +51,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "grpo_coophumaneval" - dir: "output" + dir: output_grpo_che tags: ["grpo", "coophumaneval"] diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 7236796..3a3288c 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -20,7 +20,7 @@ dataset: eval_split: "test[:32]" output: - base_dir: "output" + base_dir: output_grpo_he save_final_model: false verbose: false @@ -51,5 +51,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "grpo_humaneval" - dir: "output" + dir: output_grpo_he tags: ["grpo", "humaneval"] diff --git a/configs/grpo_mbpp_config.yaml b/configs/grpo_mbpp_config.yaml index bcf002f..5ff09cd 100644 --- a/configs/grpo_mbpp_config.yaml +++ b/configs/grpo_mbpp_config.yaml @@ -20,7 +20,7 @@ dataset: eval_split: "test[:15]" output: - base_dir: "output" + base_dir: output_grpo_mbpp save_final_model: false verbose: false @@ -51,5 +51,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "grpo_mbpp" - dir: "output" + dir: output_grpo_mbpp tags: ["grpo", "mbpp"] diff --git a/configs/iac_che_config.yaml b/configs/iac_che_config.yaml index 5646f2d..faa16fd 100644 --- a/configs/iac_che_config.yaml +++ b/configs/iac_che_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "train[:16]" output: - base_dir: "output" + base_dir: output_iac_che verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_iac_che external: mode: "level_feedback" @@ -58,5 +58,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "iac_coophumaneval" - dir: "output" + dir: output_iac_che tags: ["iac", "coophumaneval", "multi-agent", "turns_2"] diff --git a/configs/iac_he_config.yaml b/configs/iac_he_config.yaml index 2582103..491d5ba 100644 --- a/configs/iac_he_config.yaml +++ b/configs/iac_he_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "test[:32]" output: - base_dir: "output" + base_dir: output_iac_he verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_iac_he external: mode: "level_feedback" @@ -58,5 +58,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "iac_humaneval" - dir: "output" + dir: output_iac_he tags: ["iac", "humaneval", "multi-agent", "turns_2"] diff --git a/configs/iac_mbpp_config.yaml b/configs/iac_mbpp_config.yaml index b39f84d..b273ac6 100644 --- a/configs/iac_mbpp_config.yaml +++ b/configs/iac_mbpp_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "test[:15]" output: - base_dir: "output" + base_dir: output_iac_mbpp verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_iac_mbpp external: mode: "level_feedback" @@ -58,5 +58,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "iac_mbpp" - dir: "output" + dir: output_iac_mbpp tags: ["iac", "mbpp", "multi-agent", "turns_2"] diff --git a/configs/maac_che_config.yaml b/configs/maac_che_config.yaml index 91becf7..2e38e18 100644 --- a/configs/maac_che_config.yaml +++ b/configs/maac_che_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "train[:16]" output: - base_dir: "output" + base_dir: output_maac_che verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_maac_che external: mode: "level_feedback" @@ -57,5 +57,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "maac_coophumaneval" - dir: "output" + dir: output_maac_che tags: ["maac", "coophumaneval", "multi-agent", "turns_2"] diff --git a/configs/maac_he_config.yaml b/configs/maac_he_config.yaml index 07f0986..af55e33 100644 --- a/configs/maac_he_config.yaml +++ b/configs/maac_he_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "test[:32]" output: - base_dir: "output" + base_dir: output_maac_he verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_maac_he external: mode: "level_feedback" @@ -57,5 +57,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "maac_humaneval" - dir: "output" + dir: output_maac_he tags: ["maac", "humaneval", "multi-agent", "turns_2"] diff --git a/configs/maac_mbpp_config.yaml b/configs/maac_mbpp_config.yaml index 076d8e4..c0bdc59 100644 --- a/configs/maac_mbpp_config.yaml +++ b/configs/maac_mbpp_config.yaml @@ -24,10 +24,10 @@ dataset: eval_split: "test[:15]" output: - base_dir: "output" + base_dir: output_maac_mbpp verbose: false save_final_model: false - save_path: "output/maac_final" + save_path: output_maac_mbpp external: mode: "level_feedback" @@ -57,5 +57,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "maac_mbpp" - dir: "output" + dir: output_maac_mbpp tags: ["maac", "mbpp", "multi-agent", "turns_2"] diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 658a718..2671eab 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -23,7 +23,7 @@ dataset: seed: 42 output: - base_dir: "output" + base_dir: output_magrpo_che save_final_model: false verbose: false @@ -59,5 +59,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "magrpo_coophumaneval" - dir: "output" + dir: output_magrpo_che tags: ["magrpo", "coophumaneval", "multi-agent", "turns_2"] diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index 34bfb37..dcfb7c1 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -23,7 +23,7 @@ dataset: seed: 42 output: - base_dir: "output" + base_dir: output_magrpo_he save_final_model: false verbose: false @@ -59,5 +59,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "magrpo_humaneval" - dir: "output" + dir: output_magrpo_he tags: ["magrpo", "humaneval", "multi-agent"] diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index 09e9991..6ea4992 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -23,7 +23,7 @@ dataset: seed: 42 output: - base_dir: "output" + base_dir: output_magrpo_mbpp save_final_model: false verbose: false @@ -59,5 +59,5 @@ wandb: project: "comlrl" entity: "OpenMLRL" name: "magrpo_mbpp" - dir: "output" + dir: output_magrpo_mbpp tags: ["magrpo", "mbpp", "multi-agent"] From 3e9dffb874b2d336fcc8825370650b3af056add5 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 20:42:58 -0500 Subject: [PATCH 06/14] ud --- configs/ac_che_config.yaml | 4 ++-- configs/ac_he_config.yaml | 4 ++-- configs/ac_mbpp_config.yaml | 4 ++-- configs/grpo_che_config.yaml | 4 ++-- configs/grpo_he_config.yaml | 4 ++-- configs/grpo_mbpp_config.yaml | 4 ++-- configs/iac_che_config.yaml | 4 ++-- configs/iac_he_config.yaml | 4 ++-- configs/iac_mbpp_config.yaml | 4 ++-- configs/maac_che_config.yaml | 4 ++-- configs/maac_he_config.yaml | 4 ++-- configs/maac_mbpp_config.yaml | 4 ++-- configs/magrpo_che_config.yaml | 4 ++-- configs/magrpo_he_config.yaml | 4 ++-- configs/magrpo_mbpp_config.yaml | 4 ++-- 15 files changed, 30 insertions(+), 30 deletions(-) diff --git a/configs/ac_che_config.yaml b/configs/ac_che_config.yaml index 573885a..ba703e2 100644 --- a/configs/ac_che_config.yaml +++ b/configs/ac_che_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/ac_he_config.yaml b/configs/ac_he_config.yaml index ffec31e..10052b4 100644 --- a/configs/ac_he_config.yaml +++ b/configs/ac_he_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/ac_mbpp_config.yaml b/configs/ac_mbpp_config.yaml index f76dd7f..1b28744 100644 --- a/configs/ac_mbpp_config.yaml +++ b/configs/ac_mbpp_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index ee346bb..24b7da4 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 3a3288c..d09676f 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/grpo_mbpp_config.yaml b/configs/grpo_mbpp_config.yaml index 5ff09cd..2c87c6c 100644 --- a/configs/grpo_mbpp_config.yaml +++ b/configs/grpo_mbpp_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/iac_che_config.yaml b/configs/iac_che_config.yaml index faa16fd..d90b27b 100644 --- a/configs/iac_che_config.yaml +++ b/configs/iac_che_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/iac_he_config.yaml b/configs/iac_he_config.yaml index 491d5ba..8c213cb 100644 --- a/configs/iac_he_config.yaml +++ b/configs/iac_he_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/iac_mbpp_config.yaml b/configs/iac_mbpp_config.yaml index b273ac6..ee10fdb 100644 --- a/configs/iac_mbpp_config.yaml +++ b/configs/iac_mbpp_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/maac_che_config.yaml b/configs/maac_che_config.yaml index 2e38e18..b8cca73 100644 --- a/configs/maac_che_config.yaml +++ b/configs/maac_che_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/maac_he_config.yaml b/configs/maac_he_config.yaml index af55e33..9113c23 100644 --- a/configs/maac_he_config.yaml +++ b/configs/maac_he_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/maac_mbpp_config.yaml b/configs/maac_mbpp_config.yaml index c0bdc59..1c31709 100644 --- a/configs/maac_mbpp_config.yaml +++ b/configs/maac_mbpp_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 2671eab..ff94c2e 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index dcfb7c1..a48b641 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index 6ea4992..5f3d511 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen2.5-Coder-3B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "bfloat16" From 8cff3894a4fb7c024097c31dc4a3ab42d6f8a062 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 20:51:53 -0500 Subject: [PATCH 07/14] ud --- configs/grpo_che_config.yaml | 40 +++++++++++++----------------- configs/grpo_he_config.yaml | 40 +++++++++++++----------------- configs/grpo_mbpp_config.yaml | 40 +++++++++++++----------------- configs/magrpo_mbpp_config.yaml | 43 ++++++++++++++------------------- 4 files changed, 69 insertions(+), 94 deletions(-) diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 24b7da4..472e99f 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -1,38 +1,31 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen2.5-Coder-3B + type: qwen + temperature: 0.8 + top_p: 0.95 top_k: null max_length: 2048 - torch_dtype: "bfloat16" - + torch_dtype: bfloat16 agents: null - critic_model: null - critics: null - dataset: - name: "OpenMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "train[16:]" - eval_split: "train[:16]" - + name: OpenMLRL/CoopHumanEval + type: coophumaneval + train_split: train[16:] + eval_split: train[:16] output: base_dir: output_grpo_che save_final_model: false verbose: false - external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 - grpo: parallel_training: auto num_turns: 2 num_train_epochs: 20 - agent_learning_rate: 2.0e-5 + agent_learning_rate: 2.0e-05 logging_steps: 50 num_generations: 4 max_new_tokens: 256 @@ -46,10 +39,11 @@ grpo: eval_batch_size: 1 joint_mode: aligned reward_shift: -2.1 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "grpo_coophumaneval" + project: comlrl + entity: OpenMLRL + name: grpo_coophumaneval dir: output_grpo_che - tags: ["grpo", "coophumaneval"] + tags: + - grpo + - coophumaneval diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index d09676f..96d1c93 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -1,38 +1,31 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen2.5-Coder-3B + type: qwen + temperature: 0.8 + top_p: 0.95 top_k: null max_length: 2048 - torch_dtype: "bfloat16" - + torch_dtype: bfloat16 agents: null - critic_model: null - critics: null - dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" - + name: openai/openai_humaneval + type: humaneval + train_split: test[33:163] + eval_split: test[:32] output: base_dir: output_grpo_he save_final_model: false verbose: false - external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 - grpo: parallel_training: auto num_turns: 2 num_train_epochs: 6 - agent_learning_rate: 2.0e-5 + agent_learning_rate: 2.0e-05 logging_steps: 50 num_generations: 4 max_new_tokens: 256 @@ -46,10 +39,11 @@ grpo: eval_batch_size: 1 joint_mode: aligned reward_shift: -2.1 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "grpo_humaneval" + project: comlrl + entity: OpenMLRL + name: grpo_humaneval dir: output_grpo_he - tags: ["grpo", "humaneval"] + tags: + - grpo + - humaneval diff --git a/configs/grpo_mbpp_config.yaml b/configs/grpo_mbpp_config.yaml index 2c87c6c..637b8bf 100644 --- a/configs/grpo_mbpp_config.yaml +++ b/configs/grpo_mbpp_config.yaml @@ -1,38 +1,31 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen2.5-Coder-3B + type: qwen + temperature: 0.8 + top_p: 0.95 top_k: null max_length: 2048 - torch_dtype: "bfloat16" - + torch_dtype: bfloat16 agents: null - critic_model: null - critics: null - dataset: - name: "OpenMLRL/MBPP" - type: "mbpp" - train_split: "test[15:65]" - eval_split: "test[:15]" - + name: OpenMLRL/MBPP + type: mbpp + train_split: test[15:65] + eval_split: test[:15] output: base_dir: output_grpo_mbpp save_final_model: false verbose: false - external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 - grpo: parallel_training: auto num_turns: 2 num_train_epochs: 8 - agent_learning_rate: 3.0e-5 + agent_learning_rate: 3.0e-05 logging_steps: 50 num_generations: 4 max_new_tokens: 256 @@ -46,10 +39,11 @@ grpo: eval_batch_size: 1 joint_mode: aligned reward_shift: -2.1 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "grpo_mbpp" + project: comlrl + entity: OpenMLRL + name: grpo_mbpp dir: output_grpo_mbpp - tags: ["grpo", "mbpp"] + tags: + - grpo + - mbpp diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index 5f3d511..317f62c 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -1,42 +1,34 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen2.5-Coder-3B + type: qwen + temperature: 0.8 + top_p: 0.95 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 special_tokens: {} - agents: null - critic_model: null - critics: null - dataset: - name: "OpenMLRL/MBPP" - type: "mbpp" - train_split: "test[15:65]" - eval_split: "test[:15]" - + name: OpenMLRL/MBPP + type: mbpp + train_split: test[15:65] + eval_split: test[:15] seed: 42 - output: base_dir: output_magrpo_mbpp save_final_model: false verbose: false - external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 - magrpo: parallel_training: auto num_agents: 2 num_turns: 2 num_train_epochs: 8 - agent_learning_rate: 3.0e-5 + agent_learning_rate: 3.0e-05 logging_steps: 50 num_generations: 4 max_new_tokens: 256 @@ -49,15 +41,16 @@ magrpo: eval_interval: 4 eval_num_samples: 4 eval_batch_size: 1 - reward_processor: enabled: true scale_factor: 1.0 shift: -4 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "magrpo_mbpp" + project: comlrl + entity: OpenMLRL + name: magrpo_mbpp dir: output_magrpo_mbpp - tags: ["magrpo", "mbpp", "multi-agent"] + tags: + - magrpo + - mbpp + - multi-agent From 436fcbbb46dc1f9e20c7eea7abd562033dd3e578 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 00:08:20 -0500 Subject: [PATCH 08/14] ud --- configs/ac_che_config.yaml | 2 +- configs/ac_he_config.yaml | 2 +- configs/ac_mbpp_config.yaml | 2 +- configs/grpo_che_config.yaml | 2 +- configs/grpo_he_config.yaml | 2 +- configs/grpo_mbpp_config.yaml | 2 +- configs/iac_che_config.yaml | 2 +- configs/iac_he_config.yaml | 2 +- configs/iac_mbpp_config.yaml | 2 +- configs/maac_che_config.yaml | 2 +- configs/maac_he_config.yaml | 2 +- configs/maac_mbpp_config.yaml | 2 +- configs/magrpo_che_config.yaml | 2 +- configs/magrpo_he_config.yaml | 2 +- configs/magrpo_mbpp_config.yaml | 2 +- train_ac.py | 2 +- train_grpo.py | 2 +- train_iac.py | 2 +- train_maac.py | 2 +- train_magrpo.py | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/configs/ac_che_config.yaml b/configs/ac_che_config.yaml index ba703e2..427510d 100644 --- a/configs/ac_che_config.yaml +++ b/configs/ac_che_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 ac: - parallel_training: auto + parallel_training: mp num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/ac_he_config.yaml b/configs/ac_he_config.yaml index 10052b4..0defb3c 100644 --- a/configs/ac_he_config.yaml +++ b/configs/ac_he_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 ac: - parallel_training: auto + parallel_training: mp num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/ac_mbpp_config.yaml b/configs/ac_mbpp_config.yaml index 1b28744..58706f0 100644 --- a/configs/ac_mbpp_config.yaml +++ b/configs/ac_mbpp_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 ac: - parallel_training: auto + parallel_training: mp num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 472e99f..86cf0ca 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -22,7 +22,7 @@ external: mode: level_feedback sandbox_slice: 1 grpo: - parallel_training: auto + parallel_training: mp num_turns: 2 num_train_epochs: 20 agent_learning_rate: 2.0e-05 diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 96d1c93..f33c671 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -22,7 +22,7 @@ external: mode: level_feedback sandbox_slice: 1 grpo: - parallel_training: auto + parallel_training: mp num_turns: 2 num_train_epochs: 6 agent_learning_rate: 2.0e-05 diff --git a/configs/grpo_mbpp_config.yaml b/configs/grpo_mbpp_config.yaml index 637b8bf..a1842d0 100644 --- a/configs/grpo_mbpp_config.yaml +++ b/configs/grpo_mbpp_config.yaml @@ -22,7 +22,7 @@ external: mode: level_feedback sandbox_slice: 1 grpo: - parallel_training: auto + parallel_training: mp num_turns: 2 num_train_epochs: 8 agent_learning_rate: 3.0e-05 diff --git a/configs/iac_che_config.yaml b/configs/iac_che_config.yaml index d90b27b..0e638fc 100644 --- a/configs/iac_che_config.yaml +++ b/configs/iac_che_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 iac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/iac_he_config.yaml b/configs/iac_he_config.yaml index 8c213cb..ea45400 100644 --- a/configs/iac_he_config.yaml +++ b/configs/iac_he_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 iac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/iac_mbpp_config.yaml b/configs/iac_mbpp_config.yaml index ee10fdb..6f74f02 100644 --- a/configs/iac_mbpp_config.yaml +++ b/configs/iac_mbpp_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 iac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/maac_che_config.yaml b/configs/maac_che_config.yaml index b8cca73..86baf2d 100644 --- a/configs/maac_che_config.yaml +++ b/configs/maac_che_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 maac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/maac_he_config.yaml b/configs/maac_he_config.yaml index 9113c23..d661cf4 100644 --- a/configs/maac_he_config.yaml +++ b/configs/maac_he_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 maac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/maac_mbpp_config.yaml b/configs/maac_mbpp_config.yaml index 1c31709..df688f6 100644 --- a/configs/maac_mbpp_config.yaml +++ b/configs/maac_mbpp_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 maac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index ff94c2e..56538b9 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -32,7 +32,7 @@ external: sandbox_slice: 1 magrpo: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 num_train_epochs: 8 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index a48b641..d88e6b8 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -32,7 +32,7 @@ external: sandbox_slice: 1 magrpo: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 num_train_epochs: 6 diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index 317f62c..0c62420 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -24,7 +24,7 @@ external: mode: level_feedback sandbox_slice: 1 magrpo: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 2 num_train_epochs: 8 diff --git a/train_ac.py b/train_ac.py index 9907d95..741a935 100644 --- a/train_ac.py +++ b/train_ac.py @@ -384,7 +384,7 @@ def external_transition_fn( num_agents=1, num_generations=ac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_training=str(ac_cfg.get("parallel_training", "auto")).strip().lower(), + parallel_training=str(ac_cfg.get("parallel_training", "mp")).strip().lower(), agent_devices=ac_cfg.get("agent_devices", None), critic_devices=ac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=ac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_grpo.py b/train_grpo.py index 650ec1e..45d893e 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -337,7 +337,7 @@ def _resolver(prompt: str): temperature=temperature, top_p=top_p, top_k=top_k, - parallel_training=str(grpo_config.get("parallel_training", "auto")).strip().lower(), + parallel_training=str(grpo_config.get("parallel_training", "mp")).strip().lower(), agent_devices=grpo_config.get("agent_devices", None), discount=grpo_config.get("discount", 0.9), joint_mode=grpo_config.get("joint_mode", "aligned"), diff --git a/train_iac.py b/train_iac.py index eebf7b7..ab21308 100644 --- a/train_iac.py +++ b/train_iac.py @@ -417,7 +417,7 @@ def external_transition_fn( num_agents=num_agents, num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_training=str(iac_cfg.get("parallel_training", "auto")).strip().lower(), + parallel_training=str(iac_cfg.get("parallel_training", "mp")).strip().lower(), agent_devices=iac_cfg.get("agent_devices", None), critic_devices=iac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_maac.py b/train_maac.py index 89e2306..5f486f6 100644 --- a/train_maac.py +++ b/train_maac.py @@ -411,7 +411,7 @@ def external_transition_fn( top_k=top_k, num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), - parallel_training=str(maac_cfg.get("parallel_training", "auto")).strip().lower(), + parallel_training=str(maac_cfg.get("parallel_training", "mp")).strip().lower(), agent_devices=maac_cfg.get("agent_devices", None), critic_devices=maac_cfg.get("critic_devices", None), discount=discount, diff --git a/train_magrpo.py b/train_magrpo.py index 3191fcd..0a57c60 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -408,7 +408,7 @@ def _resolver(prompt: str): { "num_agents": num_agents, "parallel_training": str( - magrpo_config.get("parallel_training", "auto") + magrpo_config.get("parallel_training", "mp") ).strip().lower(), "agent_devices": magrpo_config.get("agent_devices", None), "discount": magrpo_config.get("discount", 0.9), From a59ba78f3cf24c341a41d4f9c61ce68a5d229711 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 10:10:19 -0500 Subject: [PATCH 09/14] ud --- configs/ac_che_config.yaml | 2 +- configs/ac_he_config.yaml | 2 +- configs/ac_mbpp_config.yaml | 2 +- configs/grpo_che_config.yaml | 2 +- configs/grpo_he_config.yaml | 2 +- configs/grpo_mbpp_config.yaml | 2 +- configs/iac_che_config.yaml | 4 +++- configs/iac_he_config.yaml | 4 +++- configs/iac_mbpp_config.yaml | 4 +++- configs/maac_che_config.yaml | 4 +++- configs/maac_he_config.yaml | 4 +++- configs/maac_mbpp_config.yaml | 4 +++- configs/magrpo_che_config.yaml | 3 ++- configs/magrpo_he_config.yaml | 3 ++- configs/magrpo_mbpp_config.yaml | 3 ++- train_ac.py | 2 +- train_grpo.py | 2 +- train_iac.py | 2 +- train_maac.py | 2 +- train_magrpo.py | 2 +- 20 files changed, 35 insertions(+), 20 deletions(-) diff --git a/configs/ac_che_config.yaml b/configs/ac_che_config.yaml index 427510d..84be84b 100644 --- a/configs/ac_che_config.yaml +++ b/configs/ac_che_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 ac: - parallel_training: mp + parallel_training: none num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/ac_he_config.yaml b/configs/ac_he_config.yaml index 0defb3c..05a688d 100644 --- a/configs/ac_he_config.yaml +++ b/configs/ac_he_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 ac: - parallel_training: mp + parallel_training: none num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/ac_mbpp_config.yaml b/configs/ac_mbpp_config.yaml index 58706f0..a22fe87 100644 --- a/configs/ac_mbpp_config.yaml +++ b/configs/ac_mbpp_config.yaml @@ -34,7 +34,7 @@ external: sandbox_slice: 1 ac: - parallel_training: mp + parallel_training: none num_turns: 2 num_train_epochs: 80 agent_learning_rate: 5.0e-6 diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 86cf0ca..e7998fa 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -22,7 +22,7 @@ external: mode: level_feedback sandbox_slice: 1 grpo: - parallel_training: mp + parallel_training: none num_turns: 2 num_train_epochs: 20 agent_learning_rate: 2.0e-05 diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index f33c671..3be5b0e 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -22,7 +22,7 @@ external: mode: level_feedback sandbox_slice: 1 grpo: - parallel_training: mp + parallel_training: none num_turns: 2 num_train_epochs: 6 agent_learning_rate: 2.0e-05 diff --git a/configs/grpo_mbpp_config.yaml b/configs/grpo_mbpp_config.yaml index a1842d0..2db1230 100644 --- a/configs/grpo_mbpp_config.yaml +++ b/configs/grpo_mbpp_config.yaml @@ -22,7 +22,7 @@ external: mode: level_feedback sandbox_slice: 1 grpo: - parallel_training: mp + parallel_training: none num_turns: 2 num_train_epochs: 8 agent_learning_rate: 3.0e-05 diff --git a/configs/iac_che_config.yaml b/configs/iac_che_config.yaml index 0e638fc..1e2ed7e 100644 --- a/configs/iac_che_config.yaml +++ b/configs/iac_che_config.yaml @@ -34,7 +34,9 @@ external: sandbox_slice: 1 iac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/iac_he_config.yaml b/configs/iac_he_config.yaml index ea45400..f6a4880 100644 --- a/configs/iac_he_config.yaml +++ b/configs/iac_he_config.yaml @@ -34,7 +34,9 @@ external: sandbox_slice: 1 iac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/iac_mbpp_config.yaml b/configs/iac_mbpp_config.yaml index 6f74f02..f202867 100644 --- a/configs/iac_mbpp_config.yaml +++ b/configs/iac_mbpp_config.yaml @@ -34,7 +34,9 @@ external: sandbox_slice: 1 iac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 2 use_separate_critic: true diff --git a/configs/maac_che_config.yaml b/configs/maac_che_config.yaml index 86baf2d..b08a0fa 100644 --- a/configs/maac_che_config.yaml +++ b/configs/maac_che_config.yaml @@ -34,7 +34,9 @@ external: sandbox_slice: 1 maac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/maac_he_config.yaml b/configs/maac_he_config.yaml index d661cf4..dc8f137 100644 --- a/configs/maac_he_config.yaml +++ b/configs/maac_he_config.yaml @@ -34,7 +34,9 @@ external: sandbox_slice: 1 maac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/maac_mbpp_config.yaml b/configs/maac_mbpp_config.yaml index df688f6..75efeca 100644 --- a/configs/maac_mbpp_config.yaml +++ b/configs/maac_mbpp_config.yaml @@ -34,7 +34,9 @@ external: sandbox_slice: 1 maac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 2 critic_type: "v" diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 56538b9..85c2b24 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -32,7 +32,8 @@ external: sandbox_slice: 1 magrpo: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] num_agents: 2 num_turns: 2 num_train_epochs: 8 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index d88e6b8..d873a2e 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -32,7 +32,8 @@ external: sandbox_slice: 1 magrpo: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] num_agents: 2 num_turns: 2 num_train_epochs: 6 diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index 0c62420..0a34801 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -24,7 +24,8 @@ external: mode: level_feedback sandbox_slice: 1 magrpo: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] num_agents: 2 num_turns: 2 num_train_epochs: 8 diff --git a/train_ac.py b/train_ac.py index 741a935..f872d20 100644 --- a/train_ac.py +++ b/train_ac.py @@ -384,7 +384,7 @@ def external_transition_fn( num_agents=1, num_generations=ac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_training=str(ac_cfg.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(ac_cfg.get("parallel_training", "none")).strip().lower(), agent_devices=ac_cfg.get("agent_devices", None), critic_devices=ac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=ac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_grpo.py b/train_grpo.py index 45d893e..e69f24a 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -337,7 +337,7 @@ def _resolver(prompt: str): temperature=temperature, top_p=top_p, top_k=top_k, - parallel_training=str(grpo_config.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(grpo_config.get("parallel_training", "none")).strip().lower(), agent_devices=grpo_config.get("agent_devices", None), discount=grpo_config.get("discount", 0.9), joint_mode=grpo_config.get("joint_mode", "aligned"), diff --git a/train_iac.py b/train_iac.py index ab21308..104701e 100644 --- a/train_iac.py +++ b/train_iac.py @@ -417,7 +417,7 @@ def external_transition_fn( num_agents=num_agents, num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_training=str(iac_cfg.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(iac_cfg.get("parallel_training", "none")).strip().lower(), agent_devices=iac_cfg.get("agent_devices", None), critic_devices=iac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_maac.py b/train_maac.py index 5f486f6..8d84e28 100644 --- a/train_maac.py +++ b/train_maac.py @@ -411,7 +411,7 @@ def external_transition_fn( top_k=top_k, num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), - parallel_training=str(maac_cfg.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(maac_cfg.get("parallel_training", "none")).strip().lower(), agent_devices=maac_cfg.get("agent_devices", None), critic_devices=maac_cfg.get("critic_devices", None), discount=discount, diff --git a/train_magrpo.py b/train_magrpo.py index 0a57c60..98aa05b 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -408,7 +408,7 @@ def _resolver(prompt: str): { "num_agents": num_agents, "parallel_training": str( - magrpo_config.get("parallel_training", "mp") + magrpo_config.get("parallel_training", "none") ).strip().lower(), "agent_devices": magrpo_config.get("agent_devices", None), "discount": magrpo_config.get("discount", 0.9), From 676bb261142c086d6c8b60ee6eda396c24e68596 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 11:01:27 -0500 Subject: [PATCH 10/14] ud --- configs/magrpo_mbpp_config.yaml | 4 ++-- train_iac.py | 14 +++++++------- train_maac.py | 12 ++++++------ train_magrpo.py | 12 ++++++------ 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index 0a34801..fdeea5e 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -1,8 +1,8 @@ agent_model: name: Qwen/Qwen2.5-Coder-3B type: qwen - temperature: 0.8 - top_p: 0.95 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: bfloat16 diff --git a/train_iac.py b/train_iac.py index 104701e..d588dd6 100644 --- a/train_iac.py +++ b/train_iac.py @@ -365,9 +365,9 @@ def _resolver(prompt: str): critic_model_kwargs = dict(model_kwargs) if critic_config is not None and critic_config.torch_dtype is not None: critic_model_kwargs["torch_dtype"] = critic_config.torch_dtype - num_turns = iac_cfg.get("num_turns", 1) + num_turns = iac_cfg.get("num_turns", 2) - rollout_buffer_size = iac_cfg.get("rollout_buffer_size", 8) + rollout_buffer_size = iac_cfg.get("rollout_buffer_size", 4) external_transition_fn = None if num_turns > 1: @@ -404,7 +404,7 @@ def external_transition_fn( external_transition=external_transition_fn, args=IACConfig( num_turns=num_turns, - num_train_epochs=iac_cfg.get("num_train_epochs", 40), + num_train_epochs=iac_cfg.get("num_train_epochs", 80), agent_learning_rate=iac_cfg.get("agent_learning_rate", 5e-6), critic_learning_rate=iac_cfg.get("critic_learning_rate", 5e-6), value_loss_coef=iac_cfg.get("value_loss_coef", 0.6), @@ -418,18 +418,18 @@ def external_transition_fn( num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, parallel_training=str(iac_cfg.get("parallel_training", "none")).strip().lower(), - agent_devices=iac_cfg.get("agent_devices", None), - critic_devices=iac_cfg.get("critic_devices", None), + agent_devices=iac_cfg.get("agent_devices", ["cuda:0"]), + critic_devices=iac_cfg.get("critic_devices", ["cuda:0"]), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), value_head_hidden_dim=iac_cfg.get("value_head_hidden_dim"), discount=iac_cfg.get("discount", 0.9), early_termination_threshold=iac_cfg.get( "early_termination_threshold", -0.2 ), - eval_interval=iac_cfg.get("eval_interval", 16), + eval_interval=iac_cfg.get("eval_interval", 40), eval_num_samples=iac_cfg.get("eval_num_samples", 4), eval_batch_size=iac_cfg.get("eval_batch_size", 1), - logging_steps=iac_cfg.get("logging_steps", 1), + logging_steps=iac_cfg.get("logging_steps", 10), ), train_dataset=train_dataset, eval_dataset=eval_dataset, diff --git a/train_maac.py b/train_maac.py index 8d84e28..74cd545 100644 --- a/train_maac.py +++ b/train_maac.py @@ -400,11 +400,11 @@ def external_transition_fn( external_transition=external_transition_fn, args=MAACConfig( num_turns=num_turns, - num_train_epochs=maac_cfg.get("num_train_epochs", 40), + num_train_epochs=maac_cfg.get("num_train_epochs", 80), agent_learning_rate=maac_cfg.get("agent_learning_rate", 5e-6), critic_learning_rate=maac_cfg.get("critic_learning_rate", 5e-6), value_loss_coef=maac_cfg.get("value_loss_coef", 0.6), - rollout_buffer_size=maac_cfg.get("rollout_buffer_size", 8), + rollout_buffer_size=maac_cfg.get("rollout_buffer_size", 4), max_new_tokens=maac_cfg.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, @@ -412,17 +412,17 @@ def external_transition_fn( num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), parallel_training=str(maac_cfg.get("parallel_training", "none")).strip().lower(), - agent_devices=maac_cfg.get("agent_devices", None), - critic_devices=maac_cfg.get("critic_devices", None), + agent_devices=maac_cfg.get("agent_devices", ["cuda:0"]), + critic_devices=maac_cfg.get("critic_devices", ["cuda:0"]), discount=discount, critic_type=maac_cfg.get("critic_type", "v"), early_termination_threshold=maac_cfg.get( "early_termination_threshold", -0.2 ), - eval_interval=maac_cfg.get("eval_interval", 16), + eval_interval=maac_cfg.get("eval_interval", 40), eval_num_samples=maac_cfg.get("eval_num_samples", 4), eval_batch_size=maac_cfg.get("eval_batch_size", 1), - logging_steps=maac_cfg.get("logging_steps", 1), + logging_steps=maac_cfg.get("logging_steps", 10), ), train_dataset=train_dataset, eval_dataset=eval_dataset, diff --git a/train_magrpo.py b/train_magrpo.py index 98aa05b..19f0f23 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -395,8 +395,8 @@ def _resolver(prompt: str): magrpo_args_kwargs = { "num_turns": num_turns, - "num_train_epochs": magrpo_config.get("num_train_epochs", 20), - "agent_learning_rate": magrpo_config.get("agent_learning_rate", 5e-6), + "num_train_epochs": magrpo_config.get("num_train_epochs", 8), + "agent_learning_rate": magrpo_config.get("agent_learning_rate", 2e-5), "logging_steps": magrpo_config.get("logging_steps", 50), "num_generations": magrpo_config.get("num_generations", 4), "max_new_tokens": magrpo_config.get("max_new_tokens", 256), @@ -410,18 +410,18 @@ def _resolver(prompt: str): "parallel_training": str( magrpo_config.get("parallel_training", "none") ).strip().lower(), - "agent_devices": magrpo_config.get("agent_devices", None), + "agent_devices": magrpo_config.get("agent_devices", ["cuda:0"]), "discount": magrpo_config.get("discount", 0.9), "joint_mode": magrpo_config.get("joint_mode", "aligned"), "early_termination_threshold": magrpo_config.get( "early_termination_threshold", -0.2 ), - "rollout_buffer_size": magrpo_config.get("rollout_buffer_size", 2), - "train_batch_size": magrpo_config.get("train_batch_size", None), + "rollout_buffer_size": magrpo_config.get("rollout_buffer_size", 4), + "train_batch_size": magrpo_config.get("train_batch_size", 4), "advantage_normalization": magrpo_config.get( "advantage_normalization", True ), - "eval_interval": magrpo_config.get("eval_interval", 16), + "eval_interval": magrpo_config.get("eval_interval", 4), "eval_num_samples": magrpo_config.get("eval_num_samples", 4), "eval_batch_size": magrpo_config.get("eval_batch_size", 1), "external_prompt_passthrough": True, From 3b79f517b91b693178a72135f85fc64ac6910133 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 14:10:02 -0500 Subject: [PATCH 11/14] ud --- configs/iac_che_config.yaml | 1 + configs/iac_he_config.yaml | 1 + configs/iac_mbpp_config.yaml | 1 + configs/maac_che_config.yaml | 1 + configs/maac_he_config.yaml | 1 + configs/maac_mbpp_config.yaml | 1 + configs/magrpo_che_config.yaml | 1 + configs/magrpo_he_config.yaml | 1 + configs/magrpo_mbpp_config.yaml | 1 + train_iac.py | 12 ++++++++++++ train_maac.py | 13 +++++++++++++ train_magrpo.py | 13 ++++++++++++- 12 files changed, 46 insertions(+), 1 deletion(-) diff --git a/configs/iac_che_config.yaml b/configs/iac_che_config.yaml index 1e2ed7e..502c938 100644 --- a/configs/iac_che_config.yaml +++ b/configs/iac_che_config.yaml @@ -32,6 +32,7 @@ output: external: mode: "level_feedback" sandbox_slice: 1 + external_prompt_passthrough: false iac: parallel_training: none diff --git a/configs/iac_he_config.yaml b/configs/iac_he_config.yaml index f6a4880..cafbec4 100644 --- a/configs/iac_he_config.yaml +++ b/configs/iac_he_config.yaml @@ -32,6 +32,7 @@ output: external: mode: "level_feedback" sandbox_slice: 1 + external_prompt_passthrough: false iac: parallel_training: none diff --git a/configs/iac_mbpp_config.yaml b/configs/iac_mbpp_config.yaml index f202867..55cc5af 100644 --- a/configs/iac_mbpp_config.yaml +++ b/configs/iac_mbpp_config.yaml @@ -32,6 +32,7 @@ output: external: mode: "level_feedback" sandbox_slice: 1 + external_prompt_passthrough: false iac: parallel_training: none diff --git a/configs/maac_che_config.yaml b/configs/maac_che_config.yaml index b08a0fa..1b72779 100644 --- a/configs/maac_che_config.yaml +++ b/configs/maac_che_config.yaml @@ -32,6 +32,7 @@ output: external: mode: "level_feedback" sandbox_slice: 1 + external_prompt_passthrough: false maac: parallel_training: none diff --git a/configs/maac_he_config.yaml b/configs/maac_he_config.yaml index dc8f137..dccade6 100644 --- a/configs/maac_he_config.yaml +++ b/configs/maac_he_config.yaml @@ -32,6 +32,7 @@ output: external: mode: "level_feedback" sandbox_slice: 1 + external_prompt_passthrough: false maac: parallel_training: none diff --git a/configs/maac_mbpp_config.yaml b/configs/maac_mbpp_config.yaml index 75efeca..690b5d0 100644 --- a/configs/maac_mbpp_config.yaml +++ b/configs/maac_mbpp_config.yaml @@ -32,6 +32,7 @@ output: external: mode: "level_feedback" sandbox_slice: 1 + external_prompt_passthrough: false maac: parallel_training: none diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 85c2b24..301266c 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -30,6 +30,7 @@ output: external: mode: "level_feedback" sandbox_slice: 1 + external_prompt_passthrough: false magrpo: parallel_training: none diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index d873a2e..1a9ad90 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -30,6 +30,7 @@ output: external: mode: "level_feedback" sandbox_slice: 1 + external_prompt_passthrough: false magrpo: parallel_training: none diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index fdeea5e..65260ea 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -23,6 +23,7 @@ output: external: mode: level_feedback sandbox_slice: 1 + external_prompt_passthrough: false magrpo: parallel_training: none agent_devices: ["cuda:0"] diff --git a/train_iac.py b/train_iac.py index d588dd6..52c6c53 100644 --- a/train_iac.py +++ b/train_iac.py @@ -257,6 +257,17 @@ def main() -> None: config.save(config_save_path) external_cfg = config.get_section("external") if hasattr(config, "get_section") else {} + _ext_passthrough = external_cfg.get("external_prompt_passthrough", False) + if isinstance(_ext_passthrough, str): + external_prompt_passthrough = _ext_passthrough.strip().lower() in { + "1", + "true", + "yes", + "y", + "on", + } + else: + external_prompt_passthrough = bool(_ext_passthrough) def _normalize_prompt(p: str) -> str: return " ".join((p or "").split()).strip() @@ -423,6 +434,7 @@ def external_transition_fn( critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), value_head_hidden_dim=iac_cfg.get("value_head_hidden_dim"), discount=iac_cfg.get("discount", 0.9), + external_prompt_passthrough=external_prompt_passthrough, early_termination_threshold=iac_cfg.get( "early_termination_threshold", -0.2 ), diff --git a/train_maac.py b/train_maac.py index 74cd545..62cded5 100644 --- a/train_maac.py +++ b/train_maac.py @@ -248,6 +248,18 @@ def main() -> None: config.save(config_save_path) external_cfg = config.get_section("external") if hasattr(config, "get_section") else {} + _ext_passthrough = external_cfg.get("external_prompt_passthrough", False) + if isinstance(_ext_passthrough, str): + external_prompt_passthrough = _ext_passthrough.strip().lower() in { + "1", + "true", + "yes", + "y", + "on", + } + else: + external_prompt_passthrough = bool(_ext_passthrough) + def _normalize_prompt(p: str) -> str: return " ".join((p or "").split()).strip() @@ -415,6 +427,7 @@ def external_transition_fn( agent_devices=maac_cfg.get("agent_devices", ["cuda:0"]), critic_devices=maac_cfg.get("critic_devices", ["cuda:0"]), discount=discount, + external_prompt_passthrough=external_prompt_passthrough, critic_type=maac_cfg.get("critic_type", "v"), early_termination_threshold=maac_cfg.get( "early_termination_threshold", -0.2 diff --git a/train_magrpo.py b/train_magrpo.py index 19f0f23..851f3fb 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -304,6 +304,17 @@ def main(): top_p = model_config.top_p top_k = model_config.top_k external_cfg = config.get_section("external") if hasattr(config, "get_section") else {} + _ext_passthrough = external_cfg.get("external_prompt_passthrough", False) + if isinstance(_ext_passthrough, str): + external_prompt_passthrough = _ext_passthrough.strip().lower() in { + "1", + "true", + "yes", + "y", + "on", + } + else: + external_prompt_passthrough = bool(_ext_passthrough) # Register external context resolver using dataset items def _normalize_prompt(p: str) -> str: @@ -424,7 +435,7 @@ def _resolver(prompt: str): "eval_interval": magrpo_config.get("eval_interval", 4), "eval_num_samples": magrpo_config.get("eval_num_samples", 4), "eval_batch_size": magrpo_config.get("eval_batch_size", 1), - "external_prompt_passthrough": True, + "external_prompt_passthrough": external_prompt_passthrough, } ) magrpo_args = MAGRPOConfig(**magrpo_args_kwargs) From f33c0068269df56a588a71f31f6471ea1cb100ab Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 16:19:11 -0500 Subject: [PATCH 12/14] Update train_magrpo.py --- train_magrpo.py | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/train_magrpo.py b/train_magrpo.py index 851f3fb..415c37f 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -17,7 +17,7 @@ from config import Config, add_config_args, parse_overrides from datasets import load_dataset import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoTokenizer from loggers.mt_code_logger import ( aggregate_mt_humaneval_metrics_for_logging, @@ -490,26 +490,6 @@ def _resolver(prompt: str): code_rewards.VERBOSE = bool(output_verbose) import external as external_mod external_mod.VERBOSE = bool(output_verbose) - model_kwargs: Dict[str, Any] = {} - if model_config.torch_dtype is not None: - model_kwargs["torch_dtype"] = model_config.torch_dtype - if agent_names: - agents = [ - AutoModelForCausalLM.from_pretrained( - name, - **model_kwargs, - ) - for name in agent_names - ] - else: - agents = [ - AutoModelForCausalLM.from_pretrained( - model_name, - **model_kwargs, - ) - for _ in range(num_agents) - ] - reward_processor = None if config.get("reward_processor.enabled", True): scale_factor = config.get("reward_processor.scale_factor", 1.0) @@ -525,11 +505,17 @@ def _resolver(prompt: str): prev = reward_processor reward_processor = (lambda p=prev, s=shift_proc: (lambda x: s(p(x))))() # Build trainer kwargs (grouped: model/data, reward/formatting, logging, args) + model_arg = model_name or None + agents_arg = agent_names trainer_kwargs = { - "agent_model": model_name or None, - "agents": agents, + "agent_model": model_arg, + "agents": agents_arg, "num_agents": num_agents, "tokenizer": tokenizers if agent_names else tokenizer, + "model_config": { + "torch_dtype": model_config.torch_dtype, + "special_tokens": model_config.special_tokens, + }, "train_dataset": train_dataset, "eval_dataset": eval_dataset, "reward_func": reward_func, From 6c233abf37068169318143683d8d0de816443ec5 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 17:01:30 -0500 Subject: [PATCH 13/14] ud --- configs/ac_che_config.yaml | 40 +++++++++++++++------------- configs/ac_he_config.yaml | 40 +++++++++++++++------------- configs/ac_mbpp_config.yaml | 40 +++++++++++++++------------- configs/grpo_che_config.yaml | 12 +++++++-- configs/grpo_he_config.yaml | 12 +++++++-- configs/grpo_mbpp_config.yaml | 12 +++++++-- configs/iac_che_config.yaml | 44 +++++++++++++++++-------------- configs/iac_he_config.yaml | 44 +++++++++++++++++-------------- configs/iac_mbpp_config.yaml | 44 +++++++++++++++++-------------- configs/maac_che_config.yaml | 46 +++++++++++++++++++-------------- configs/maac_he_config.yaml | 46 +++++++++++++++++++-------------- configs/maac_mbpp_config.yaml | 46 +++++++++++++++++++-------------- configs/magrpo_che_config.yaml | 35 ++++++++++++++----------- configs/magrpo_he_config.yaml | 34 +++++++++++++----------- configs/magrpo_mbpp_config.yaml | 17 +++++++++--- 15 files changed, 302 insertions(+), 210 deletions(-) diff --git a/configs/ac_che_config.yaml b/configs/ac_che_config.yaml index 84be84b..fd7cf0d 100644 --- a/configs/ac_che_config.yaml +++ b/configs/ac_che_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "OpenMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "train[16:]" - eval_split: "train[:16]" + name: OpenMLRL/CoopHumanEval + type: coophumaneval + train_split: train[16:] + eval_split: train[:16] output: base_dir: output_ac_che @@ -30,29 +30,33 @@ output: save_path: output_ac_che external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 ac: parallel_training: none num_turns: 2 num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 - advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 256 discount: 0.9 early_termination_threshold: -0.2 + advantage_normalization: true eval_interval: 40 eval_num_samples: 4 eval_batch_size: 1 reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "ac_coophumaneval" + project: comlrl + entity: OpenMLRL + name: ac_coophumaneval dir: output_ac_che - tags: ["ac", "coophumaneval", "single-agent", "turns_2"] + tags: + - ac + - coophumaneval + - single-agent + - turns_2 diff --git a/configs/ac_he_config.yaml b/configs/ac_he_config.yaml index 05a688d..b17532a 100644 --- a/configs/ac_he_config.yaml +++ b/configs/ac_he_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" + name: openai/openai_humaneval + type: humaneval + train_split: test[33:163] + eval_split: test[:32] output: base_dir: output_ac_he @@ -30,29 +30,33 @@ output: save_path: output_ac_he external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 ac: parallel_training: none num_turns: 2 num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 - advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 256 discount: 0.9 early_termination_threshold: -0.2 + advantage_normalization: true eval_interval: 40 eval_num_samples: 4 eval_batch_size: 1 reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "ac_humaneval" + project: comlrl + entity: OpenMLRL + name: ac_humaneval dir: output_ac_he - tags: ["ac", "humaneval", "single-agent", "turns_2"] + tags: + - ac + - humaneval + - single-agent + - turns_2 diff --git a/configs/ac_mbpp_config.yaml b/configs/ac_mbpp_config.yaml index a22fe87..1b49439 100644 --- a/configs/ac_mbpp_config.yaml +++ b/configs/ac_mbpp_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "OpenMLRL/MBPP" - type: "mbpp" - train_split: "test[15:65]" - eval_split: "test[:15]" + name: OpenMLRL/MBPP + type: mbpp + train_split: test[15:65] + eval_split: test[:15] output: base_dir: output_ac_mbpp @@ -30,29 +30,33 @@ output: save_path: output_ac_mbpp external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 ac: parallel_training: none num_turns: 2 num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 - advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 256 discount: 0.9 early_termination_threshold: -0.2 + advantage_normalization: true eval_interval: 40 eval_num_samples: 4 eval_batch_size: 1 reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "ac_mbpp" + project: comlrl + entity: OpenMLRL + name: ac_mbpp dir: output_ac_mbpp - tags: ["ac", "mbpp", "single-agent", "turns_2"] + tags: + - ac + - mbpp + - single-agent + - turns_2 diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index e7998fa..14f1b68 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -6,21 +6,28 @@ agent_model: top_k: null max_length: 2048 torch_dtype: bfloat16 + agents: null + critic_model: null + critics: null + dataset: name: OpenMLRL/CoopHumanEval type: coophumaneval train_split: train[16:] eval_split: train[:16] + output: base_dir: output_grpo_che - save_final_model: false verbose: false + save_final_model: false + external: mode: level_feedback sandbox_slice: 1 + grpo: parallel_training: none num_turns: 2 @@ -30,6 +37,7 @@ grpo: num_generations: 4 max_new_tokens: 256 discount: 0.9 + joint_mode: aligned early_termination_threshold: -0.1 rollout_buffer_size: 2 train_batch_size: 2 @@ -37,8 +45,8 @@ grpo: eval_interval: 4 eval_num_samples: 4 eval_batch_size: 1 - joint_mode: aligned reward_shift: -2.1 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 3be5b0e..51d9639 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -6,21 +6,28 @@ agent_model: top_k: null max_length: 2048 torch_dtype: bfloat16 + agents: null + critic_model: null + critics: null + dataset: name: openai/openai_humaneval type: humaneval train_split: test[33:163] eval_split: test[:32] + output: base_dir: output_grpo_he - save_final_model: false verbose: false + save_final_model: false + external: mode: level_feedback sandbox_slice: 1 + grpo: parallel_training: none num_turns: 2 @@ -30,6 +37,7 @@ grpo: num_generations: 4 max_new_tokens: 256 discount: 0.9 + joint_mode: aligned early_termination_threshold: -0.1 rollout_buffer_size: 2 train_batch_size: 2 @@ -37,8 +45,8 @@ grpo: eval_interval: 4 eval_num_samples: 4 eval_batch_size: 1 - joint_mode: aligned reward_shift: -2.1 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/grpo_mbpp_config.yaml b/configs/grpo_mbpp_config.yaml index 2db1230..6d2ddda 100644 --- a/configs/grpo_mbpp_config.yaml +++ b/configs/grpo_mbpp_config.yaml @@ -6,21 +6,28 @@ agent_model: top_k: null max_length: 2048 torch_dtype: bfloat16 + agents: null + critic_model: null + critics: null + dataset: name: OpenMLRL/MBPP type: mbpp train_split: test[15:65] eval_split: test[:15] + output: base_dir: output_grpo_mbpp - save_final_model: false verbose: false + save_final_model: false + external: mode: level_feedback sandbox_slice: 1 + grpo: parallel_training: none num_turns: 2 @@ -30,6 +37,7 @@ grpo: num_generations: 4 max_new_tokens: 256 discount: 0.9 + joint_mode: aligned early_termination_threshold: -0.1 rollout_buffer_size: 2 train_batch_size: 2 @@ -37,8 +45,8 @@ grpo: eval_interval: 4 eval_num_samples: 4 eval_batch_size: 1 - joint_mode: aligned reward_shift: -2.1 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/iac_che_config.yaml b/configs/iac_che_config.yaml index 502c938..e8944ea 100644 --- a/configs/iac_che_config.yaml +++ b/configs/iac_che_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "OpenMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "train[16:]" - eval_split: "train[:16]" + name: OpenMLRL/CoopHumanEval + type: coophumaneval + train_split: train[16:] + eval_split: train[:16] output: base_dir: output_iac_che @@ -30,20 +30,22 @@ output: save_path: output_iac_che external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false iac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 2 use_separate_critic: true num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 value_clip_range: 0.2 rollout_buffer_size: 4 @@ -58,8 +60,12 @@ iac: reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "iac_coophumaneval" + project: comlrl + entity: OpenMLRL + name: iac_coophumaneval dir: output_iac_che - tags: ["iac", "coophumaneval", "multi-agent", "turns_2"] + tags: + - iac + - coophumaneval + - multi-agent + - turns_2 diff --git a/configs/iac_he_config.yaml b/configs/iac_he_config.yaml index cafbec4..4a9c2ac 100644 --- a/configs/iac_he_config.yaml +++ b/configs/iac_he_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" + name: openai/openai_humaneval + type: humaneval + train_split: test[33:163] + eval_split: test[:32] output: base_dir: output_iac_he @@ -30,20 +30,22 @@ output: save_path: output_iac_he external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false iac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 2 use_separate_critic: true num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 value_clip_range: 0.2 rollout_buffer_size: 4 @@ -58,8 +60,12 @@ iac: reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "iac_humaneval" + project: comlrl + entity: OpenMLRL + name: iac_humaneval dir: output_iac_he - tags: ["iac", "humaneval", "multi-agent", "turns_2"] + tags: + - iac + - humaneval + - multi-agent + - turns_2 diff --git a/configs/iac_mbpp_config.yaml b/configs/iac_mbpp_config.yaml index 55cc5af..209fe8b 100644 --- a/configs/iac_mbpp_config.yaml +++ b/configs/iac_mbpp_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "OpenMLRL/MBPP" - type: "mbpp" - train_split: "test[15:65]" - eval_split: "test[:15]" + name: OpenMLRL/MBPP + type: mbpp + train_split: test[15:65] + eval_split: test[:15] output: base_dir: output_iac_mbpp @@ -30,20 +30,22 @@ output: save_path: output_iac_mbpp external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false iac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 2 use_separate_critic: true num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 value_clip_range: 0.2 rollout_buffer_size: 4 @@ -58,8 +60,12 @@ iac: reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "iac_mbpp" + project: comlrl + entity: OpenMLRL + name: iac_mbpp dir: output_iac_mbpp - tags: ["iac", "mbpp", "multi-agent", "turns_2"] + tags: + - iac + - mbpp + - multi-agent + - turns_2 diff --git a/configs/maac_che_config.yaml b/configs/maac_che_config.yaml index 1b72779..3ca2c20 100644 --- a/configs/maac_che_config.yaml +++ b/configs/maac_che_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "OpenMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "train[16:]" - eval_split: "train[:16]" + name: OpenMLRL/CoopHumanEval + type: coophumaneval + train_split: train[16:] + eval_split: train[:16] output: base_dir: output_maac_che @@ -30,20 +30,22 @@ output: save_path: output_maac_che external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false maac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 2 - critic_type: "v" + critic_type: v num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 rollout_buffer_size: 4 train_batch_size: 4 @@ -57,8 +59,12 @@ maac: reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "maac_coophumaneval" + project: comlrl + entity: OpenMLRL + name: maac_coophumaneval dir: output_maac_che - tags: ["maac", "coophumaneval", "multi-agent", "turns_2"] + tags: + - maac + - coophumaneval + - multi-agent + - turns_2 diff --git a/configs/maac_he_config.yaml b/configs/maac_he_config.yaml index dccade6..7c6c60e 100644 --- a/configs/maac_he_config.yaml +++ b/configs/maac_he_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" + name: openai/openai_humaneval + type: humaneval + train_split: test[33:163] + eval_split: test[:32] output: base_dir: output_maac_he @@ -30,20 +30,22 @@ output: save_path: output_maac_he external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false maac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 2 - critic_type: "v" + critic_type: v num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 rollout_buffer_size: 4 train_batch_size: 4 @@ -57,8 +59,12 @@ maac: reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "maac_humaneval" + project: comlrl + entity: OpenMLRL + name: maac_humaneval dir: output_maac_he - tags: ["maac", "humaneval", "multi-agent", "turns_2"] + tags: + - maac + - humaneval + - multi-agent + - turns_2 diff --git a/configs/maac_mbpp_config.yaml b/configs/maac_mbpp_config.yaml index 690b5d0..0ef3f41 100644 --- a/configs/maac_mbpp_config.yaml +++ b/configs/maac_mbpp_config.yaml @@ -1,27 +1,27 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 agents: null critic_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen max_length: 2048 - torch_dtype: "bfloat16" + torch_dtype: bfloat16 critics: null dataset: - name: "OpenMLRL/MBPP" - type: "mbpp" - train_split: "test[15:65]" - eval_split: "test[:15]" + name: OpenMLRL/MBPP + type: mbpp + train_split: test[15:65] + eval_split: test[:15] output: base_dir: output_maac_mbpp @@ -30,20 +30,22 @@ output: save_path: output_maac_mbpp external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false maac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 2 - critic_type: "v" + critic_type: v num_train_epochs: 80 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 rollout_buffer_size: 4 train_batch_size: 4 @@ -57,8 +59,12 @@ maac: reward_shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "maac_mbpp" + project: comlrl + entity: OpenMLRL + name: maac_mbpp dir: output_maac_mbpp - tags: ["maac", "mbpp", "multi-agent", "turns_2"] + tags: + - maac + - mbpp + - multi-agent + - turns_2 diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 301266c..3f3e013 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -1,12 +1,12 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" special_tokens: {} + torch_dtype: bfloat16 agents: null @@ -15,30 +15,31 @@ critic_model: null critics: null dataset: - name: "OpenMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "train[16:]" - eval_split: "train[:16]" + name: OpenMLRL/CoopHumanEval + type: coophumaneval + train_split: train[16:] + eval_split: train[:16] seed: 42 output: base_dir: output_magrpo_che - save_final_model: false verbose: false + save_final_model: false external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false magrpo: parallel_training: none - agent_devices: ["cuda:0"] + agent_devices: + - cuda:0 num_agents: 2 num_turns: 2 num_train_epochs: 8 - agent_learning_rate: 2.0e-5 + agent_learning_rate: 2.0e-05 logging_steps: 50 num_generations: 4 max_new_tokens: 256 @@ -58,8 +59,12 @@ reward_processor: shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "magrpo_coophumaneval" + project: comlrl + entity: OpenMLRL + name: magrpo_coophumaneval dir: output_magrpo_che - tags: ["magrpo", "coophumaneval", "multi-agent", "turns_2"] + tags: + - magrpo + - coophumaneval + - multi-agent + - turns_2 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index 1a9ad90..0525ab1 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -1,12 +1,12 @@ agent_model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" + name: Qwen/Qwen2.5-Coder-3B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "bfloat16" special_tokens: {} + torch_dtype: bfloat16 agents: null @@ -15,30 +15,31 @@ critic_model: null critics: null dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" + name: openai/openai_humaneval + type: humaneval + train_split: test[33:163] + eval_split: test[:32] seed: 42 output: base_dir: output_magrpo_he - save_final_model: false verbose: false + save_final_model: false external: - mode: "level_feedback" + mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false magrpo: parallel_training: none - agent_devices: ["cuda:0"] + agent_devices: + - cuda:0 num_agents: 2 num_turns: 2 num_train_epochs: 6 - agent_learning_rate: 2.0e-5 + agent_learning_rate: 2.0e-05 logging_steps: 50 num_generations: 4 max_new_tokens: 256 @@ -58,8 +59,11 @@ reward_processor: shift: -4 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "magrpo_humaneval" + project: comlrl + entity: OpenMLRL + name: magrpo_humaneval dir: output_magrpo_he - tags: ["magrpo", "humaneval", "multi-agent"] + tags: + - magrpo + - humaneval + - multi-agent diff --git a/configs/magrpo_mbpp_config.yaml b/configs/magrpo_mbpp_config.yaml index 65260ea..b7d64f1 100644 --- a/configs/magrpo_mbpp_config.yaml +++ b/configs/magrpo_mbpp_config.yaml @@ -5,28 +5,37 @@ agent_model: top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: bfloat16 special_tokens: {} + torch_dtype: bfloat16 + agents: null + critic_model: null + critics: null + dataset: name: OpenMLRL/MBPP type: mbpp train_split: test[15:65] eval_split: test[:15] + seed: 42 + output: base_dir: output_magrpo_mbpp - save_final_model: false verbose: false + save_final_model: false + external: mode: level_feedback sandbox_slice: 1 external_prompt_passthrough: false + magrpo: parallel_training: none - agent_devices: ["cuda:0"] + agent_devices: + - cuda:0 num_agents: 2 num_turns: 2 num_train_epochs: 8 @@ -43,10 +52,12 @@ magrpo: eval_interval: 4 eval_num_samples: 4 eval_batch_size: 1 + reward_processor: enabled: true scale_factor: 1.0 shift: -4 + wandb: project: comlrl entity: OpenMLRL From 3cde27b500040cc3eb3fe2b1cae1532cad8e62d2 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 22:29:02 -0500 Subject: [PATCH 14/14] ud --- external/level_feedback.py | 3 ++- loggers/code_logger.py | 3 ++- rewards/code_rewards.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/external/level_feedback.py b/external/level_feedback.py index a51ad5e..505f581 100644 --- a/external/level_feedback.py +++ b/external/level_feedback.py @@ -1,5 +1,6 @@ import ast import signal +import math from typing import Dict, List, Tuple, Optional from rewards.code_utils import ( @@ -36,7 +37,7 @@ def _run_tests( MAX_TIMEOUTS = 3 # Prepare execution environment - exec_globals: Dict[str, object] = {} + exec_globals: Dict[str, object] = {"math": math} try: exec(combined_code, exec_globals) except Exception as e: diff --git a/loggers/code_logger.py b/loggers/code_logger.py index c32c9d4..08da5cf 100644 --- a/loggers/code_logger.py +++ b/loggers/code_logger.py @@ -1,4 +1,5 @@ import signal +import math @@ -135,7 +136,7 @@ def code_reward_logger( try: # Load code definitions - exec_globals = {} + exec_globals = {"math": math} exec(combined_code, exec_globals) # Run individual test cases diff --git a/rewards/code_rewards.py b/rewards/code_rewards.py index f5021ea..a36f65a 100644 --- a/rewards/code_rewards.py +++ b/rewards/code_rewards.py @@ -1,5 +1,6 @@ import re import signal +import math from typing import List import builtins @@ -188,7 +189,7 @@ def print(*args, **kwargs): # type: ignore try: # Create execution environment (no timeout needed for function definitions) - exec_globals = {} + exec_globals = {"math": math} exec(combined_code, exec_globals) print("✅ Code definitions loaded successfully")