From 34807826c7750dad0d9c4aa022f6810da5e22e14 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 11:27:16 -0500 Subject: [PATCH 01/12] Resolve CoMLRL from local workspace --- config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/config.py b/config.py index 8a65304..60d21c7 100644 --- a/config.py +++ b/config.py @@ -4,6 +4,8 @@ """ import argparse +import os +import sys from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, Optional @@ -11,6 +13,12 @@ import yaml +REPO_ROOT = os.path.dirname(os.path.abspath(__file__)) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) + + @dataclass(frozen=True) class ModelConfig: """Configuration for model loading and generation.""" From 85c6124c501144a535273e10e40b50f64d2b9674 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 13:39:25 -0500 Subject: [PATCH 02/12] propagate parallel mode and device scheduling settings to trainers --- train_ac.py | 3 +++ train_grpo.py | 2 ++ train_iac.py | 3 +++ train_maac.py | 3 +++ train_magrpo.py | 2 ++ 5 files changed, 13 insertions(+) diff --git a/train_ac.py b/train_ac.py index 80139db..675acb6 100644 --- a/train_ac.py +++ b/train_ac.py @@ -291,6 +291,9 @@ def main() -> None: num_agents=1, num_generations=ac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, + parallel_mode=str(ac_cfg.get("parallel_mode", "auto")).strip().lower(), + agent_devices=ac_cfg.get("agent_devices", None), + critic_devices=ac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=ac_cfg.get("critic_value_head_hidden_dim"), value_head_hidden_dim=ac_cfg.get("value_head_hidden_dim"), discount=ac_cfg.get("discount", 0.9), diff --git a/train_grpo.py b/train_grpo.py index cc6f585..c8bb39f 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -225,6 +225,8 @@ def main(): top_p=top_p, top_k=top_k, num_agents=1, + 
parallel_mode=str(grpo_cfg.get("parallel_mode", "auto")).strip().lower(), + agent_devices=grpo_cfg.get("agent_devices", None), early_termination_threshold=grpo_cfg.get( "early_termination_threshold", -0.2 ), diff --git a/train_iac.py b/train_iac.py index e625e7a..793be3b 100644 --- a/train_iac.py +++ b/train_iac.py @@ -318,6 +318,9 @@ def main() -> None: num_agents=num_agents, num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, + parallel_mode=str(iac_cfg.get("parallel_mode", "auto")).strip().lower(), + agent_devices=iac_cfg.get("agent_devices", None), + critic_devices=iac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), value_head_hidden_dim=iac_cfg.get("value_head_hidden_dim"), discount=iac_cfg.get("discount", 0.9), diff --git a/train_maac.py b/train_maac.py index 51b798d..ed58574 100644 --- a/train_maac.py +++ b/train_maac.py @@ -314,6 +314,9 @@ def main() -> None: top_k=top_k, num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), + parallel_mode=str(maac_cfg.get("parallel_mode", "auto")).strip().lower(), + agent_devices=maac_cfg.get("agent_devices", None), + critic_devices=maac_cfg.get("critic_devices", None), discount=maac_cfg.get("discount", 0.9), critic_type=maac_cfg.get("critic_type", "v"), eval_interval=maac_cfg.get("eval_interval", 4), diff --git a/train_magrpo.py b/train_magrpo.py index 47d486d..cf1eaa8 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -341,6 +341,8 @@ def main(): top_p=top_p, top_k=top_k, num_agents=num_agents, + parallel_mode=str(magrpo_cfg.get("parallel_mode", "auto")).strip().lower(), + agent_devices=magrpo_cfg.get("agent_devices", None), early_termination_threshold=magrpo_cfg.get( "early_termination_threshold", -0.2 ), From 6c583456be6748332f4ba4c7a0b2da9cb2ec9515 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 14:02:38 -0500 Subject: [PATCH 03/12] use parallel_training key with explicit auto defaults 
--- configs/ac_arxiv_config.yaml | 1 + configs/ac_tldr_config.yaml | 1 + configs/grpo_arxiv_config.yaml | 1 + configs/grpo_tldr_config.yaml | 1 + configs/iac_arxiv_config.yaml | 1 + configs/iac_tldr_config.yaml | 1 + configs/maac_arxiv_config.yaml | 1 + configs/maac_tldr_config.yaml | 1 + configs/magrpo_arxiv_config.yaml | 1 + configs/magrpo_tldr_config.yaml | 1 + train_ac.py | 2 +- train_grpo.py | 2 +- train_iac.py | 2 +- train_maac.py | 2 +- train_magrpo.py | 2 +- 15 files changed, 15 insertions(+), 5 deletions(-) diff --git a/configs/ac_arxiv_config.yaml b/configs/ac_arxiv_config.yaml index 948242f..c9d4a0a 100644 --- a/configs/ac_arxiv_config.yaml +++ b/configs/ac_arxiv_config.yaml @@ -34,6 +34,7 @@ output: save_path: "./ac_output/ac_arxiv" ac: + parallel_training: auto num_turns: 1 num_train_epochs: 4 agent_learning_rate: 5.0e-6 diff --git a/configs/ac_tldr_config.yaml b/configs/ac_tldr_config.yaml index 855ce83..462fe25 100644 --- a/configs/ac_tldr_config.yaml +++ b/configs/ac_tldr_config.yaml @@ -34,6 +34,7 @@ output: save_path: "./ac_output/ac_tldr" ac: + parallel_training: auto num_turns: 1 num_train_epochs: 4 agent_learning_rate: 5.0e-6 diff --git a/configs/grpo_arxiv_config.yaml b/configs/grpo_arxiv_config.yaml index ef37939..d9ee357 100644 --- a/configs/grpo_arxiv_config.yaml +++ b/configs/grpo_arxiv_config.yaml @@ -28,6 +28,7 @@ output: save_path: "./grpo_output/arxiv_single" grpo: + parallel_training: auto num_turns: 1 num_train_epochs: 1 agent_learning_rate: 5.0e-6 diff --git a/configs/grpo_tldr_config.yaml b/configs/grpo_tldr_config.yaml index f19e4a1..1e6b18d 100644 --- a/configs/grpo_tldr_config.yaml +++ b/configs/grpo_tldr_config.yaml @@ -28,6 +28,7 @@ output: save_path: "./grpo_output/tldr_single" grpo: + parallel_training: auto num_turns: 1 num_train_epochs: 1 agent_learning_rate: 5.0e-6 diff --git a/configs/iac_arxiv_config.yaml b/configs/iac_arxiv_config.yaml index 5548117..6f73912 100644 --- a/configs/iac_arxiv_config.yaml +++ 
b/configs/iac_arxiv_config.yaml @@ -34,6 +34,7 @@ output: save_path: "./iac_output/iac_arxiv" iac: + parallel_training: auto num_agents: 2 num_turns: 1 use_separate_critic: true diff --git a/configs/iac_tldr_config.yaml b/configs/iac_tldr_config.yaml index 823bb3c..fe76881 100644 --- a/configs/iac_tldr_config.yaml +++ b/configs/iac_tldr_config.yaml @@ -34,6 +34,7 @@ output: save_path: "./iac_output/iac_tldr" iac: + parallel_training: auto num_agents: 2 num_turns: 1 use_separate_critic: true diff --git a/configs/maac_arxiv_config.yaml b/configs/maac_arxiv_config.yaml index a6d74ae..045216b 100644 --- a/configs/maac_arxiv_config.yaml +++ b/configs/maac_arxiv_config.yaml @@ -34,6 +34,7 @@ output: save_path: "./maac_output/maac_arxiv" maac: + parallel_training: auto num_agents: 2 num_turns: 1 critic_type: "v" diff --git a/configs/maac_tldr_config.yaml b/configs/maac_tldr_config.yaml index 39f9927..eb2d1c9 100644 --- a/configs/maac_tldr_config.yaml +++ b/configs/maac_tldr_config.yaml @@ -34,6 +34,7 @@ output: save_path: "./maac_output/maac_tldr" maac: + parallel_training: auto num_agents: 2 num_turns: 1 critic_type: "v" diff --git a/configs/magrpo_arxiv_config.yaml b/configs/magrpo_arxiv_config.yaml index 7648710..fc1224e 100644 --- a/configs/magrpo_arxiv_config.yaml +++ b/configs/magrpo_arxiv_config.yaml @@ -28,6 +28,7 @@ output: save_path: "./magrpo_output/arxiv" magrpo: + parallel_training: auto num_turns: 1 num_train_epochs: 2 agent_learning_rate: 5.0e-6 diff --git a/configs/magrpo_tldr_config.yaml b/configs/magrpo_tldr_config.yaml index 96f81a9..4fdaf44 100644 --- a/configs/magrpo_tldr_config.yaml +++ b/configs/magrpo_tldr_config.yaml @@ -28,6 +28,7 @@ output: save_path: "./magrpo_output/tldr" magrpo: + parallel_training: auto num_turns: 1 num_train_epochs: 2 agent_learning_rate: 5.0e-6 diff --git a/train_ac.py b/train_ac.py index 675acb6..35d6896 100644 --- a/train_ac.py +++ b/train_ac.py @@ -291,7 +291,7 @@ def main() -> None: num_agents=1, 
num_generations=ac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_mode=str(ac_cfg.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(ac_cfg.get("parallel_training", "auto")).strip().lower(), agent_devices=ac_cfg.get("agent_devices", None), critic_devices=ac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=ac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_grpo.py b/train_grpo.py index c8bb39f..d7e20aa 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -225,7 +225,7 @@ def main(): top_p=top_p, top_k=top_k, num_agents=1, - parallel_mode=str(grpo_cfg.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(grpo_cfg.get("parallel_training", "auto")).strip().lower(), agent_devices=grpo_cfg.get("agent_devices", None), early_termination_threshold=grpo_cfg.get( "early_termination_threshold", -0.2 diff --git a/train_iac.py b/train_iac.py index 793be3b..6be8290 100644 --- a/train_iac.py +++ b/train_iac.py @@ -318,7 +318,7 @@ def main() -> None: num_agents=num_agents, num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_mode=str(iac_cfg.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(iac_cfg.get("parallel_training", "auto")).strip().lower(), agent_devices=iac_cfg.get("agent_devices", None), critic_devices=iac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_maac.py b/train_maac.py index ed58574..412d849 100644 --- a/train_maac.py +++ b/train_maac.py @@ -314,7 +314,7 @@ def main() -> None: top_k=top_k, num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), - parallel_mode=str(maac_cfg.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(maac_cfg.get("parallel_training", "auto")).strip().lower(), agent_devices=maac_cfg.get("agent_devices", None), critic_devices=maac_cfg.get("critic_devices", None), 
discount=maac_cfg.get("discount", 0.9), diff --git a/train_magrpo.py b/train_magrpo.py index cf1eaa8..59191a7 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -341,7 +341,7 @@ def main(): top_p=top_p, top_k=top_k, num_agents=num_agents, - parallel_mode=str(magrpo_cfg.get("parallel_mode", "auto")).strip().lower(), + parallel_training=str(magrpo_cfg.get("parallel_training", "auto")).strip().lower(), agent_devices=magrpo_cfg.get("agent_devices", None), early_termination_threshold=magrpo_cfg.get( "early_termination_threshold", -0.2 From 1d2dc96b1cbea0b88420eba00cc993209f02d8f2 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 15:57:31 -0500 Subject: [PATCH 04/12] Move sampling params into agent_model and validate them in ModelConfig --- config.py | 56 ++++++++++++++++++++++++++++---- configs/ac_arxiv_config.yaml | 6 +--- configs/ac_tldr_config.yaml | 6 +--- configs/grpo_arxiv_config.yaml | 2 +- configs/grpo_tldr_config.yaml | 2 +- configs/iac_arxiv_config.yaml | 6 +--- configs/iac_tldr_config.yaml | 6 +--- configs/maac_arxiv_config.yaml | 6 +--- configs/maac_tldr_config.yaml | 6 +--- configs/magrpo_arxiv_config.yaml | 4 +-- configs/magrpo_tldr_config.yaml | 4 +-- train_ac.py | 6 ++-- train_grpo.py | 6 ++-- train_iac.py | 6 ++-- train_maac.py | 6 ++-- train_magrpo.py | 6 ++-- 16 files changed, 74 insertions(+), 60 deletions(-) diff --git a/config.py b/config.py index 60d21c7..ebfb5a5 100644 --- a/config.py +++ b/config.py @@ -25,20 +25,62 @@ class ModelConfig: name: str type: str = "qwen" - temperature: float = 0.7 - top_p: float = 0.9 + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None max_length: int = 2048 special_tokens: Dict[str, str] = field(default_factory=dict) torch_dtype: Optional[str] = None @classmethod - def from_dict(cls, config_dict: Dict[str, Any]) -> "ModelConfig": + def from_dict( + cls, + config_dict: Dict[str, Any], + *, + require_sampling: bool = True, + ) -> "ModelConfig": """Create ModelConfig from dictionary.""" + if require_sampling: + missing = [ +
key + for key in ("temperature", "top_p", "top_k") + if key not in config_dict + ] + if missing: + raise ValueError( + f"agent_model is missing required sampling fields: {', '.join(missing)}" + ) + + def _as_optional_float(value: Any) -> Optional[float]: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError) as exc: + raise ValueError(f"Invalid float value: {value}") from exc + + def _as_optional_int(value: Any) -> Optional[int]: + if value is None: + return None + if isinstance(value, str) and value.strip().lower() in ("none", "null", ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError) as exc: + raise ValueError(f"Invalid int value: {value}") from exc + + temperature = _as_optional_float(config_dict.get("temperature")) + top_p = _as_optional_float(config_dict.get("top_p")) + top_k = _as_optional_int(config_dict.get("top_k")) + if require_sampling and (temperature is None or top_p is None): + raise ValueError("agent_model.temperature and agent_model.top_p must be non-null.") + return cls( name=config_dict.get("name", ""), type=config_dict.get("type", "qwen"), - temperature=config_dict.get("temperature", 0.7), - top_p=config_dict.get("top_p", 0.9), + temperature=temperature, + top_p=top_p, + top_k=top_k, max_length=config_dict.get("max_length", 2048), special_tokens=config_dict.get("special_tokens", {}), torch_dtype=( @@ -82,7 +124,7 @@ def get_agent_model_config(self) -> ModelConfig: model_section = self.get_section("agent_model") if not model_section: raise ValueError("No 'agent_model' section found in configuration") - return ModelConfig.from_dict(model_section) + return ModelConfig.from_dict(model_section, require_sampling=True) def get_critic_model_config(self, required: bool = True) -> Optional[ModelConfig]: """Get critic model configuration as ModelConfig object.""" @@ -91,7 +133,7 @@ def get_critic_model_config(self, required: bool = True) -> Optional[ModelConfig if required: raise 
ValueError("No 'critic_model' section found in configuration") return None - return ModelConfig.from_dict(critic_section) + return ModelConfig.from_dict(critic_section, require_sampling=False) def update(self, updates: Dict[str, Any]): """Update configuration with new values (deep merge).""" diff --git a/configs/ac_arxiv_config.yaml b/configs/ac_arxiv_config.yaml index c9d4a0a..c10f0e7 100644 --- a/configs/ac_arxiv_config.yaml +++ b/configs/ac_arxiv_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "auto" @@ -43,9 +42,6 @@ ac: advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 512 - temperature: 0.7 - top_p: 0.9 - top_k: null eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 diff --git a/configs/ac_tldr_config.yaml b/configs/ac_tldr_config.yaml index 462fe25..f13045c 100644 --- a/configs/ac_tldr_config.yaml +++ b/configs/ac_tldr_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "auto" @@ -43,9 +42,6 @@ ac: advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 256 - temperature: 0.7 - top_p: 0.9 - top_k: null eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 diff --git a/configs/grpo_arxiv_config.yaml b/configs/grpo_arxiv_config.yaml index d9ee357..5e89e28 100644 --- a/configs/grpo_arxiv_config.yaml +++ b/configs/grpo_arxiv_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -36,7 +37,6 @@ grpo: num_generations: 4 joint_mode: aligned max_new_tokens: 512 - top_k: null rollout_buffer_size: 2 
train_batch_size: 2 advantage_normalization: true diff --git a/configs/grpo_tldr_config.yaml b/configs/grpo_tldr_config.yaml index 1e6b18d..d703ed4 100644 --- a/configs/grpo_tldr_config.yaml +++ b/configs/grpo_tldr_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -36,7 +37,6 @@ grpo: num_generations: 4 joint_mode: aligned max_new_tokens: 256 - top_k: null rollout_buffer_size: 2 train_batch_size: 2 advantage_normalization: true diff --git a/configs/iac_arxiv_config.yaml b/configs/iac_arxiv_config.yaml index 6f73912..0168c50 100644 --- a/configs/iac_arxiv_config.yaml +++ b/configs/iac_arxiv_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "auto" @@ -46,9 +45,6 @@ iac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.7 - top_p: 0.9 - top_k: null eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 diff --git a/configs/iac_tldr_config.yaml b/configs/iac_tldr_config.yaml index fe76881..3ce4f9c 100644 --- a/configs/iac_tldr_config.yaml +++ b/configs/iac_tldr_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "auto" @@ -46,9 +45,6 @@ iac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.7 - top_p: 0.9 - top_k: null eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 diff --git a/configs/maac_arxiv_config.yaml b/configs/maac_arxiv_config.yaml index 045216b..d765524 100644 --- a/configs/maac_arxiv_config.yaml +++ b/configs/maac_arxiv_config.yaml @@ -3,6 +3,7 @@ agent_model: 
type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7bao - top_p: 0.9 max_length: 2048 torch_dtype: "auto" @@ -45,9 +44,6 @@ maac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.7 - top_p: 0.9 - top_k: null eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 diff --git a/configs/maac_tldr_config.yaml b/configs/maac_tldr_config.yaml index eb2d1c9..c591d7e 100644 --- a/configs/maac_tldr_config.yaml +++ b/configs/maac_tldr_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 max_length: 2048 torch_dtype: "auto" @@ -45,9 +44,6 @@ maac: rollout_buffer_size: 4 train_batch_size: 4 max_new_tokens: 256 - temperature: 0.7 - top_p: 0.9 - top_k: null eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 diff --git a/configs/magrpo_arxiv_config.yaml b/configs/magrpo_arxiv_config.yaml index fc1224e..53b5b3c 100644 --- a/configs/magrpo_arxiv_config.yaml +++ b/configs/magrpo_arxiv_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -35,9 +36,6 @@ magrpo: logging_steps: 50 num_generations: 4 max_new_tokens: 256 - temperature: 0.7 - top_p: 0.9 - top_k: null joint_mode: aligned rollout_buffer_size: 1 train_batch_size: 1 diff --git a/configs/magrpo_tldr_config.yaml b/configs/magrpo_tldr_config.yaml index 4fdaf44..f7617ca 100644 --- a/configs/magrpo_tldr_config.yaml +++ b/configs/magrpo_tldr_config.yaml @@ -3,6 +3,7 @@ agent_model: type: "qwen" temperature: 0.7 top_p: 0.9 + top_k: null max_length: 2048 torch_dtype: "auto" @@ -35,9 +36,6 @@ magrpo: logging_steps: 50 num_generations: 4 max_new_tokens: 256 - temperature: 0.7 - top_p: 0.9 - 
top_k: null joint_mode: aligned rollout_buffer_size: 1 train_batch_size: 1 diff --git a/train_ac.py b/train_ac.py index 35d6896..0b0562b 100644 --- a/train_ac.py +++ b/train_ac.py @@ -222,9 +222,9 @@ def main() -> None: f"Single-agent AC expects num_agents=1; received num_agents={num_agents}." ) - temperature = ac_cfg.get("temperature", model_config.temperature) - top_p = ac_cfg.get("top_p", model_config.top_p) - top_k = ac_cfg.get("top_k") + temperature = model_config.temperature + top_p = model_config.top_p + top_k = model_config.top_k use_separate_critic = bool(ac_cfg.get("use_separate_critic", True)) model_kwargs: Dict[str, Any] = {} if model_config.torch_dtype is not None: diff --git a/train_grpo.py b/train_grpo.py index d7e20aa..354ace9 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -210,9 +210,9 @@ def main(): "Please set grpo.num_turns=1 (or remove the field) in the config." ) - temperature = grpo_cfg.get("temperature", model_config.temperature) - top_p = grpo_cfg.get("top_p", model_config.top_p) - top_k = grpo_cfg.get("top_k") + temperature = model_config.temperature + top_p = model_config.top_p + top_k = model_config.top_k grpo_args = MAGRPOConfig( num_turns=1, diff --git a/train_iac.py b/train_iac.py index 6be8290..c1e9244 100644 --- a/train_iac.py +++ b/train_iac.py @@ -255,9 +255,9 @@ def main() -> None: tok.add_special_tokens(model_config.special_tokens) tokenizer = tokenizers[0] - temperature = iac_cfg.get("temperature", model_config.temperature) - top_p = iac_cfg.get("top_p", model_config.top_p) - top_k = iac_cfg.get("top_k") + temperature = model_config.temperature + top_p = model_config.top_p + top_k = model_config.top_k use_separate_critic = bool(iac_cfg.get("use_separate_critic", True)) model_kwargs: Dict[str, Any] = {} if model_config.torch_dtype is not None: diff --git a/train_maac.py b/train_maac.py index 412d849..47317d6 100644 --- a/train_maac.py +++ b/train_maac.py @@ -246,9 +246,9 @@ def main() -> None: 
tok.add_special_tokens(model_config.special_tokens) tokenizer = tokenizers[0] - temperature = maac_cfg.get("temperature", model_config.temperature) - top_p = maac_cfg.get("top_p", model_config.top_p) - top_k = maac_cfg.get("top_k") + temperature = model_config.temperature + top_p = model_config.top_p + top_k = model_config.top_k model_kwargs: Dict[str, Any] = {} if model_config.torch_dtype is not None: model_kwargs["torch_dtype"] = model_config.torch_dtype diff --git a/train_magrpo.py b/train_magrpo.py index 59191a7..bfcded7 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -326,9 +326,9 @@ def main(): "Please set magrpo.num_turns=1 (or remove the field) in the config." ) - temperature = magrpo_cfg.get("temperature", model_config.temperature) - top_p = magrpo_cfg.get("top_p", model_config.top_p) - top_k = magrpo_cfg.get("top_k") + temperature = model_config.temperature + top_p = model_config.top_p + top_k = model_config.top_k magrpo_args = MAGRPOConfig( num_turns=1, From a3ea8ce72981540820dd35045d0febd5d9081d5b Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 16:33:56 -0500 Subject: [PATCH 05/12] Give each config its own output_ALGO_DATASET output directory --- configs/ac_arxiv_config.yaml | 4 ++-- configs/ac_tldr_config.yaml | 4 ++-- configs/grpo_arxiv_config.yaml | 4 ++-- configs/grpo_tldr_config.yaml | 4 ++-- configs/iac_arxiv_config.yaml | 4 ++-- configs/iac_tldr_config.yaml | 4 ++-- configs/maac_arxiv_config.yaml | 4 ++-- configs/maac_tldr_config.yaml | 4 ++-- configs/magrpo_arxiv_config.yaml | 4 ++-- configs/magrpo_tldr_config.yaml | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/configs/ac_arxiv_config.yaml b/configs/ac_arxiv_config.yaml index c10f0e7..f2b171c 100644 --- a/configs/ac_arxiv_config.yaml +++ b/configs/ac_arxiv_config.yaml @@ -27,10 +27,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./ac_output" + base_dir: output_ac_arxiv verbose: false save_final_model: true - save_path: "./ac_output/ac_arxiv" + save_path: output_ac_arxiv ac: parallel_training: auto
diff --git a/configs/ac_tldr_config.yaml b/configs/ac_tldr_config.yaml index f13045c..64c1b9d 100644 --- a/configs/ac_tldr_config.yaml +++ b/configs/ac_tldr_config.yaml @@ -27,10 +27,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./ac_output" + base_dir: output_ac_tldr verbose: false save_final_model: true - save_path: "./ac_output/ac_tldr" + save_path: output_ac_tldr ac: parallel_training: auto diff --git a/configs/grpo_arxiv_config.yaml b/configs/grpo_arxiv_config.yaml index 5e89e28..b90c4f3 100644 --- a/configs/grpo_arxiv_config.yaml +++ b/configs/grpo_arxiv_config.yaml @@ -23,10 +23,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./grpo_output" + base_dir: output_grpo_arxiv verbose: false save_final_model: true - save_path: "./grpo_output/arxiv_single" + save_path: output_grpo_arxiv grpo: parallel_training: auto diff --git a/configs/grpo_tldr_config.yaml b/configs/grpo_tldr_config.yaml index d703ed4..08995b8 100644 --- a/configs/grpo_tldr_config.yaml +++ b/configs/grpo_tldr_config.yaml @@ -23,10 +23,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./grpo_output" + base_dir: output_grpo_tldr verbose: false save_final_model: true - save_path: "./grpo_output/tldr_single" + save_path: output_grpo_tldr grpo: parallel_training: auto diff --git a/configs/iac_arxiv_config.yaml b/configs/iac_arxiv_config.yaml index 0168c50..9bc47f7 100644 --- a/configs/iac_arxiv_config.yaml +++ b/configs/iac_arxiv_config.yaml @@ -27,10 +27,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./iac_output" + base_dir: output_iac_arxiv verbose: false save_final_model: true - save_path: "./iac_output/iac_arxiv" + save_path: output_iac_arxiv iac: parallel_training: auto diff --git a/configs/iac_tldr_config.yaml b/configs/iac_tldr_config.yaml index 3ce4f9c..a005a40 100644 --- a/configs/iac_tldr_config.yaml +++ b/configs/iac_tldr_config.yaml @@ -27,10 +27,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./iac_output" + base_dir: output_iac_tldr 
verbose: false save_final_model: true - save_path: "./iac_output/iac_tldr" + save_path: output_iac_tldr iac: parallel_training: auto diff --git a/configs/maac_arxiv_config.yaml b/configs/maac_arxiv_config.yaml index d765524..82568b6 100644 --- a/configs/maac_arxiv_config.yaml +++ b/configs/maac_arxiv_config.yaml @@ -27,10 +27,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./maac_output" + base_dir: output_maac_arxiv verbose: false save_final_model: true - save_path: "./maac_output/maac_arxiv" + save_path: output_maac_arxiv maac: parallel_training: auto diff --git a/configs/maac_tldr_config.yaml b/configs/maac_tldr_config.yaml index c591d7e..4d353e2 100644 --- a/configs/maac_tldr_config.yaml +++ b/configs/maac_tldr_config.yaml @@ -27,10 +27,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./maac_output" + base_dir: output_maac_tldr verbose: false save_final_model: true - save_path: "./maac_output/maac_tldr" + save_path: output_maac_tldr maac: parallel_training: auto diff --git a/configs/magrpo_arxiv_config.yaml b/configs/magrpo_arxiv_config.yaml index 53b5b3c..29854d1 100644 --- a/configs/magrpo_arxiv_config.yaml +++ b/configs/magrpo_arxiv_config.yaml @@ -23,10 +23,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./magrpo_output" + base_dir: output_magrpo_arxiv verbose: false save_final_model: true - save_path: "./magrpo_output/arxiv" + save_path: output_magrpo_arxiv magrpo: parallel_training: auto diff --git a/configs/magrpo_tldr_config.yaml b/configs/magrpo_tldr_config.yaml index f7617ca..413888a 100644 --- a/configs/magrpo_tldr_config.yaml +++ b/configs/magrpo_tldr_config.yaml @@ -23,10 +23,10 @@ tokenizer: padding_side: "left" output: - base_dir: "./magrpo_output" + base_dir: output_magrpo_tldr verbose: false save_final_model: true - save_path: "./magrpo_output/tldr" + save_path: output_magrpo_tldr magrpo: parallel_training: auto From 800bc8164abf12ed1890dd6d4da67cbafc63a3f1 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 
2026 20:43:03 -0500 Subject: [PATCH 06/12] Lower agent_model temperature and top_p to 0.6 in all configs --- configs/ac_arxiv_config.yaml | 4 ++-- configs/ac_tldr_config.yaml | 4 ++-- configs/grpo_arxiv_config.yaml | 4 ++-- configs/grpo_tldr_config.yaml | 4 ++-- configs/iac_arxiv_config.yaml | 4 ++-- configs/iac_tldr_config.yaml | 4 ++-- configs/maac_arxiv_config.yaml | 4 ++-- configs/maac_tldr_config.yaml | 4 ++-- configs/magrpo_arxiv_config.yaml | 4 ++-- configs/magrpo_tldr_config.yaml | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/configs/ac_arxiv_config.yaml b/configs/ac_arxiv_config.yaml index f2b171c..1623095 100644 --- a/configs/ac_arxiv_config.yaml +++ b/configs/ac_arxiv_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/ac_tldr_config.yaml b/configs/ac_tldr_config.yaml index 64c1b9d..f58e677 100644 --- a/configs/ac_tldr_config.yaml +++ b/configs/ac_tldr_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/grpo_arxiv_config.yaml b/configs/grpo_arxiv_config.yaml index b90c4f3..049fe0e 100644 --- a/configs/grpo_arxiv_config.yaml +++ b/configs/grpo_arxiv_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/grpo_tldr_config.yaml b/configs/grpo_tldr_config.yaml index 08995b8..fe18f0c 100644 --- a/configs/grpo_tldr_config.yaml +++ b/configs/grpo_tldr_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/iac_arxiv_config.yaml b/configs/iac_arxiv_config.yaml index
9bc47f7..c00dab6 100644 --- a/configs/iac_arxiv_config.yaml +++ b/configs/iac_arxiv_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/iac_tldr_config.yaml b/configs/iac_tldr_config.yaml index a005a40..ba3d4d0 100644 --- a/configs/iac_tldr_config.yaml +++ b/configs/iac_tldr_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/maac_arxiv_config.yaml b/configs/maac_arxiv_config.yaml index 82568b6..1030306 100644 --- a/configs/maac_arxiv_config.yaml +++ b/configs/maac_arxiv_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/maac_tldr_config.yaml b/configs/maac_tldr_config.yaml index 4d353e2..5457189 100644 --- a/configs/maac_tldr_config.yaml +++ b/configs/maac_tldr_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/magrpo_arxiv_config.yaml b/configs/magrpo_arxiv_config.yaml index 29854d1..c5e6383 100644 --- a/configs/magrpo_arxiv_config.yaml +++ b/configs/magrpo_arxiv_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" diff --git a/configs/magrpo_tldr_config.yaml b/configs/magrpo_tldr_config.yaml index 413888a..dd114d1 100644 --- a/configs/magrpo_tldr_config.yaml +++ b/configs/magrpo_tldr_config.yaml @@ -1,8 +1,8 @@ agent_model: name: "Qwen/Qwen3-1.7B" type: "qwen" - temperature: 0.7 - top_p: 0.9 + 
temperature: 0.6 + top_p: 0.6 top_k: null max_length: 2048 torch_dtype: "auto" From a64e093c4809d81e33f80d8e746fc6a9d2ca1384 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 20:51:57 -0500 Subject: [PATCH 07/12] ud --- configs/ac_arxiv_config.yaml | 50 ++++++++++++++---------------- configs/ac_tldr_config.yaml | 50 ++++++++++++++---------------- configs/iac_arxiv_config.yaml | 50 ++++++++++++++---------------- configs/iac_tldr_config.yaml | 50 ++++++++++++++---------------- configs/maac_arxiv_config.yaml | 52 ++++++++++++++------------------ configs/maac_tldr_config.yaml | 52 ++++++++++++++------------------ configs/magrpo_arxiv_config.yaml | 42 +++++++++++--------------- configs/magrpo_tldr_config.yaml | 42 +++++++++++--------------- 8 files changed, 170 insertions(+), 218 deletions(-) diff --git a/configs/ac_arxiv_config.yaml b/configs/ac_arxiv_config.yaml index 1623095..d829f8b 100644 --- a/configs/ac_arxiv_config.yaml +++ b/configs/ac_arxiv_config.yaml @@ -1,43 +1,36 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen3-1.7B + type: qwen + temperature: 0.7 + top_p: 0.9 top_k: null max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto agents: null - critic_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" + name: Qwen/Qwen3-1.7B + type: qwen max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto critics: null - dataset: - name: "OpenMLRL/arXiv_abstract" - type: "arxiv" - train_split: "train[:1000]" - eval_split: "val[:1000]" - + name: OpenMLRL/arXiv_abstract + type: arxiv + train_split: train[:1000] + eval_split: val[:1000] tokenizer: - padding_side: "left" - + padding_side: left output: base_dir: output_ac_arxiv verbose: false save_final_model: true save_path: output_ac_arxiv - ac: parallel_training: auto num_turns: 1 num_train_epochs: 4 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 
advantage_normalization: true rollout_buffer_size: 4 @@ -46,14 +39,15 @@ ac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 100 - reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "ac_arxiv" - tags: ["ac", "arxiv", "single-agent"] + project: comlrl + entity: OpenMLRL + name: ac_arxiv + tags: + - ac + - arxiv + - single-agent diff --git a/configs/ac_tldr_config.yaml b/configs/ac_tldr_config.yaml index f58e677..f98b6f5 100644 --- a/configs/ac_tldr_config.yaml +++ b/configs/ac_tldr_config.yaml @@ -1,43 +1,36 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen3-1.7B + type: qwen + temperature: 0.7 + top_p: 0.9 top_k: null max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto agents: null - critic_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" + name: Qwen/Qwen3-1.7B + type: qwen max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto critics: null - dataset: - name: "trl-lib/tldr" - type: "tldr" - train_split: "train[:1000]" - eval_split: "test[:1000]" - + name: trl-lib/tldr + type: tldr + train_split: train[:1000] + eval_split: test[:1000] tokenizer: - padding_side: "left" - + padding_side: left output: base_dir: output_ac_tldr verbose: false save_final_model: true save_path: output_ac_tldr - ac: parallel_training: auto num_turns: 1 num_train_epochs: 4 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 advantage_normalization: true rollout_buffer_size: 4 @@ -46,14 +39,15 @@ ac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 100 - reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "ac_tldr" - tags: ["ac", "tldr", "single-agent"] + project: comlrl + entity: OpenMLRL + name: ac_tldr + tags: + - ac + - tldr + - single-agent diff --git 
a/configs/iac_arxiv_config.yaml b/configs/iac_arxiv_config.yaml index c00dab6..92b1f24 100644 --- a/configs/iac_arxiv_config.yaml +++ b/configs/iac_arxiv_config.yaml @@ -1,45 +1,38 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen3-1.7B + type: qwen + temperature: 0.7 + top_p: 0.9 top_k: null max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto agents: null - critic_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" + name: Qwen/Qwen3-1.7B + type: qwen max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto critics: null - dataset: - name: "OpenMLRL/arXiv_abstract" - type: "arxiv" - train_split: "train[:1000]" - eval_split: "val[:1000]" - + name: OpenMLRL/arXiv_abstract + type: arxiv + train_split: train[:1000] + eval_split: val[:1000] tokenizer: - padding_side: "left" - + padding_side: left output: base_dir: output_iac_arxiv verbose: false save_final_model: true save_path: output_iac_arxiv - iac: parallel_training: auto num_agents: 2 num_turns: 1 use_separate_critic: true num_train_epochs: 20 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 value_clip_range: 0.2 rollout_buffer_size: 4 @@ -49,14 +42,15 @@ iac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 50 - reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "iac_arxiv" - tags: ["iac", "arxiv", "multi-agent"] + project: comlrl + entity: OpenMLRL + name: iac_arxiv + tags: + - iac + - arxiv + - multi-agent diff --git a/configs/iac_tldr_config.yaml b/configs/iac_tldr_config.yaml index ba3d4d0..b5a42af 100644 --- a/configs/iac_tldr_config.yaml +++ b/configs/iac_tldr_config.yaml @@ -1,45 +1,38 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen3-1.7B + type: qwen + temperature: 0.7 + top_p: 0.9 top_k: null max_length: 2048 - 
torch_dtype: "auto" - + torch_dtype: auto agents: null - critic_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" + name: Qwen/Qwen3-1.7B + type: qwen max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto critics: null - dataset: - name: "trl-lib/tldr" - type: "tldr" - train_split: "train[:1000]" - eval_split: "test[:1000]" - + name: trl-lib/tldr + type: tldr + train_split: train[:1000] + eval_split: test[:1000] tokenizer: - padding_side: "left" - + padding_side: left output: base_dir: output_iac_tldr verbose: false save_final_model: true save_path: output_iac_tldr - iac: parallel_training: auto num_agents: 2 num_turns: 1 use_separate_critic: true num_train_epochs: 20 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 value_clip_range: 0.2 rollout_buffer_size: 4 @@ -49,14 +42,15 @@ iac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 50 - reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "iac_tldr" - tags: ["iac", "tldr", "multi-agent"] + project: comlrl + entity: OpenMLRL + name: iac_tldr + tags: + - iac + - tldr + - multi-agent diff --git a/configs/maac_arxiv_config.yaml b/configs/maac_arxiv_config.yaml index 1030306..a109848 100644 --- a/configs/maac_arxiv_config.yaml +++ b/configs/maac_arxiv_config.yaml @@ -1,45 +1,38 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen3-1.7B + type: qwen + temperature: 0.7 + top_p: 0.9 top_k: null max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto agents: null - critic_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" + name: Qwen/Qwen3-1.7B + type: qwen max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto critics: null - dataset: - name: "OpenMLRL/arXiv_abstract" - type: "arxiv" - train_split: "train[:1100]" - eval_split: "val[:1100]" - + name: OpenMLRL/arXiv_abstract + type: arxiv + 
train_split: train[:1100] + eval_split: val[:1100] tokenizer: - padding_side: "left" - + padding_side: left output: base_dir: output_maac_arxiv verbose: false save_final_model: true save_path: output_maac_arxiv - maac: parallel_training: auto num_agents: 2 num_turns: 1 - critic_type: "v" + critic_type: v num_train_epochs: 20 - agent_learning_rate: 5.0e-6 - critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 rollout_buffer_size: 4 train_batch_size: 4 @@ -48,14 +41,15 @@ maac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 50 - reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "maac_arxiv" - tags: ["maac", "arxiv", "multi-agent"] + project: comlrl + entity: OpenMLRL + name: maac_arxiv + tags: + - maac + - arxiv + - multi-agent diff --git a/configs/maac_tldr_config.yaml b/configs/maac_tldr_config.yaml index 5457189..36f59c2 100644 --- a/configs/maac_tldr_config.yaml +++ b/configs/maac_tldr_config.yaml @@ -1,45 +1,38 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen3-1.7B + type: qwen + temperature: 0.7 + top_p: 0.9 top_k: null max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto agents: null - critic_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" + name: Qwen/Qwen3-1.7B + type: qwen max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto critics: null - dataset: - name: "trl-lib/tldr" - type: "tldr" - train_split: "train[:1100]" - eval_split: "test[:1100]" - + name: trl-lib/tldr + type: tldr + train_split: train[:1100] + eval_split: test[:1100] tokenizer: - padding_side: "left" - + padding_side: left output: base_dir: output_maac_tldr verbose: false save_final_model: true save_path: output_maac_tldr - maac: parallel_training: auto num_agents: 2 num_turns: 1 - critic_type: "v" + critic_type: v num_train_epochs: 20 - agent_learning_rate: 5.0e-6 - 
critic_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 + critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 rollout_buffer_size: 4 train_batch_size: 4 @@ -48,14 +41,15 @@ maac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 50 - reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "maac_tldr" - tags: ["maac", "tldr", "multi-agent"] + project: comlrl + entity: OpenMLRL + name: maac_tldr + tags: + - maac + - tldr + - multi-agent diff --git a/configs/magrpo_arxiv_config.yaml b/configs/magrpo_arxiv_config.yaml index c5e6383..291e750 100644 --- a/configs/magrpo_arxiv_config.yaml +++ b/configs/magrpo_arxiv_config.yaml @@ -1,38 +1,31 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen3-1.7B + type: qwen + temperature: 0.7 + top_p: 0.9 top_k: null max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto agents: null - critic_model: null - critics: null - dataset: - name: "OpenMLRL/arXiv_abstract" - type: "arxiv" - train_split: "train[:1100]" - eval_split: "val[:1100]" - + name: OpenMLRL/arXiv_abstract + type: arxiv + train_split: train[:1100] + eval_split: val[:1100] tokenizer: - padding_side: "left" - + padding_side: left output: base_dir: output_magrpo_arxiv verbose: false save_final_model: true save_path: output_magrpo_arxiv - magrpo: parallel_training: auto num_turns: 1 num_train_epochs: 2 - agent_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 logging_steps: 50 num_generations: 4 max_new_tokens: 256 @@ -43,14 +36,15 @@ magrpo: eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 - reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "magrpo_arxiv" - tags: ["magrpo", "arxiv", "multi-agent"] + project: comlrl + entity: OpenMLRL + name: magrpo_arxiv + tags: + - magrpo + - arxiv + - multi-agent diff --git a/configs/magrpo_tldr_config.yaml 
b/configs/magrpo_tldr_config.yaml index dd114d1..7a2c1a9 100644 --- a/configs/magrpo_tldr_config.yaml +++ b/configs/magrpo_tldr_config.yaml @@ -1,38 +1,31 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" - temperature: 0.6 - top_p: 0.6 + name: Qwen/Qwen3-1.7B + type: qwen + temperature: 0.7 + top_p: 0.9 top_k: null max_length: 2048 - torch_dtype: "auto" - + torch_dtype: auto agents: null - critic_model: null - critics: null - dataset: - name: "trl-lib/tldr" - type: "tldr" - train_split: "train[:1100]" - eval_split: "test[:1100]" - + name: trl-lib/tldr + type: tldr + train_split: train[:1100] + eval_split: test[:1100] tokenizer: - padding_side: "left" - + padding_side: left output: base_dir: output_magrpo_tldr verbose: false save_final_model: true save_path: output_magrpo_tldr - magrpo: parallel_training: auto num_turns: 1 num_train_epochs: 2 - agent_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 logging_steps: 50 num_generations: 4 max_new_tokens: 256 @@ -43,14 +36,15 @@ magrpo: eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 - reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 - wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "magrpo_tldr" - tags: ["magrpo", "tldr", "multi-agent"] + project: comlrl + entity: OpenMLRL + name: magrpo_tldr + tags: + - magrpo + - tldr + - multi-agent From eef58e4c9f988390352f0ee3c2beef57868e852e Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 00:08:25 -0500 Subject: [PATCH 08/12] ud --- configs/ac_arxiv_config.yaml | 2 +- configs/ac_tldr_config.yaml | 2 +- configs/grpo_arxiv_config.yaml | 2 +- configs/grpo_tldr_config.yaml | 2 +- configs/iac_arxiv_config.yaml | 2 +- configs/iac_tldr_config.yaml | 2 +- configs/maac_arxiv_config.yaml | 2 +- configs/maac_tldr_config.yaml | 2 +- configs/magrpo_arxiv_config.yaml | 2 +- configs/magrpo_tldr_config.yaml | 2 +- train_ac.py | 2 +- train_grpo.py | 2 +- train_iac.py | 2 +- train_maac.py | 2 +- train_magrpo.py | 2 +- 15 files changed, 15 
insertions(+), 15 deletions(-) diff --git a/configs/ac_arxiv_config.yaml b/configs/ac_arxiv_config.yaml index d829f8b..fc76337 100644 --- a/configs/ac_arxiv_config.yaml +++ b/configs/ac_arxiv_config.yaml @@ -26,7 +26,7 @@ output: save_final_model: true save_path: output_ac_arxiv ac: - parallel_training: auto + parallel_training: mp num_turns: 1 num_train_epochs: 4 agent_learning_rate: 5.0e-06 diff --git a/configs/ac_tldr_config.yaml b/configs/ac_tldr_config.yaml index f98b6f5..1b43c45 100644 --- a/configs/ac_tldr_config.yaml +++ b/configs/ac_tldr_config.yaml @@ -26,7 +26,7 @@ output: save_final_model: true save_path: output_ac_tldr ac: - parallel_training: auto + parallel_training: mp num_turns: 1 num_train_epochs: 4 agent_learning_rate: 5.0e-06 diff --git a/configs/grpo_arxiv_config.yaml b/configs/grpo_arxiv_config.yaml index 049fe0e..9ced681 100644 --- a/configs/grpo_arxiv_config.yaml +++ b/configs/grpo_arxiv_config.yaml @@ -29,7 +29,7 @@ output: save_path: output_grpo_arxiv grpo: - parallel_training: auto + parallel_training: mp num_turns: 1 num_train_epochs: 1 agent_learning_rate: 5.0e-6 diff --git a/configs/grpo_tldr_config.yaml b/configs/grpo_tldr_config.yaml index fe18f0c..44ef1dc 100644 --- a/configs/grpo_tldr_config.yaml +++ b/configs/grpo_tldr_config.yaml @@ -29,7 +29,7 @@ output: save_path: output_grpo_tldr grpo: - parallel_training: auto + parallel_training: mp num_turns: 1 num_train_epochs: 1 agent_learning_rate: 5.0e-6 diff --git a/configs/iac_arxiv_config.yaml b/configs/iac_arxiv_config.yaml index 92b1f24..2bd84c3 100644 --- a/configs/iac_arxiv_config.yaml +++ b/configs/iac_arxiv_config.yaml @@ -26,7 +26,7 @@ output: save_final_model: true save_path: output_iac_arxiv iac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 1 use_separate_critic: true diff --git a/configs/iac_tldr_config.yaml b/configs/iac_tldr_config.yaml index b5a42af..753da11 100644 --- a/configs/iac_tldr_config.yaml +++ b/configs/iac_tldr_config.yaml @@ 
-26,7 +26,7 @@ output: save_final_model: true save_path: output_iac_tldr iac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 1 use_separate_critic: true diff --git a/configs/maac_arxiv_config.yaml b/configs/maac_arxiv_config.yaml index a109848..1928ce0 100644 --- a/configs/maac_arxiv_config.yaml +++ b/configs/maac_arxiv_config.yaml @@ -26,7 +26,7 @@ output: save_final_model: true save_path: output_maac_arxiv maac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 1 critic_type: v diff --git a/configs/maac_tldr_config.yaml b/configs/maac_tldr_config.yaml index 36f59c2..b55fa8b 100644 --- a/configs/maac_tldr_config.yaml +++ b/configs/maac_tldr_config.yaml @@ -26,7 +26,7 @@ output: save_final_model: true save_path: output_maac_tldr maac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 1 critic_type: v diff --git a/configs/magrpo_arxiv_config.yaml b/configs/magrpo_arxiv_config.yaml index 291e750..df9448e 100644 --- a/configs/magrpo_arxiv_config.yaml +++ b/configs/magrpo_arxiv_config.yaml @@ -22,7 +22,7 @@ output: save_final_model: true save_path: output_magrpo_arxiv magrpo: - parallel_training: auto + parallel_training: mp num_turns: 1 num_train_epochs: 2 agent_learning_rate: 5.0e-06 diff --git a/configs/magrpo_tldr_config.yaml b/configs/magrpo_tldr_config.yaml index 7a2c1a9..83d56d7 100644 --- a/configs/magrpo_tldr_config.yaml +++ b/configs/magrpo_tldr_config.yaml @@ -22,7 +22,7 @@ output: save_final_model: true save_path: output_magrpo_tldr magrpo: - parallel_training: auto + parallel_training: mp num_turns: 1 num_train_epochs: 2 agent_learning_rate: 5.0e-06 diff --git a/train_ac.py b/train_ac.py index 0b0562b..a8db47a 100644 --- a/train_ac.py +++ b/train_ac.py @@ -291,7 +291,7 @@ def main() -> None: num_agents=1, num_generations=ac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_training=str(ac_cfg.get("parallel_training", "auto")).strip().lower(), + 
parallel_training=str(ac_cfg.get("parallel_training", "mp")).strip().lower(), agent_devices=ac_cfg.get("agent_devices", None), critic_devices=ac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=ac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_grpo.py b/train_grpo.py index 354ace9..b3d8324 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -225,7 +225,7 @@ def main(): top_p=top_p, top_k=top_k, num_agents=1, - parallel_training=str(grpo_cfg.get("parallel_training", "auto")).strip().lower(), + parallel_training=str(grpo_cfg.get("parallel_training", "mp")).strip().lower(), agent_devices=grpo_cfg.get("agent_devices", None), early_termination_threshold=grpo_cfg.get( "early_termination_threshold", -0.2 diff --git a/train_iac.py b/train_iac.py index c1e9244..0ca0f32 100644 --- a/train_iac.py +++ b/train_iac.py @@ -318,7 +318,7 @@ def main() -> None: num_agents=num_agents, num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_training=str(iac_cfg.get("parallel_training", "auto")).strip().lower(), + parallel_training=str(iac_cfg.get("parallel_training", "mp")).strip().lower(), agent_devices=iac_cfg.get("agent_devices", None), critic_devices=iac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_maac.py b/train_maac.py index 47317d6..bd7853d 100644 --- a/train_maac.py +++ b/train_maac.py @@ -314,7 +314,7 @@ def main() -> None: top_k=top_k, num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), - parallel_training=str(maac_cfg.get("parallel_training", "auto")).strip().lower(), + parallel_training=str(maac_cfg.get("parallel_training", "mp")).strip().lower(), agent_devices=maac_cfg.get("agent_devices", None), critic_devices=maac_cfg.get("critic_devices", None), discount=maac_cfg.get("discount", 0.9), diff --git a/train_magrpo.py b/train_magrpo.py index bfcded7..8d2bb59 100644 --- a/train_magrpo.py +++ 
b/train_magrpo.py @@ -341,7 +341,7 @@ def main(): top_p=top_p, top_k=top_k, num_agents=num_agents, - parallel_training=str(magrpo_cfg.get("parallel_training", "auto")).strip().lower(), + parallel_training=str(magrpo_cfg.get("parallel_training", "mp")).strip().lower(), agent_devices=magrpo_cfg.get("agent_devices", None), early_termination_threshold=magrpo_cfg.get( "early_termination_threshold", -0.2 From 5cb92d5bc363f24233ed77578bee1836dcb429cc Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 10:10:28 -0500 Subject: [PATCH 09/12] ud --- configs/ac_arxiv_config.yaml | 2 +- configs/ac_tldr_config.yaml | 2 +- configs/grpo_arxiv_config.yaml | 2 +- configs/grpo_tldr_config.yaml | 2 +- configs/iac_arxiv_config.yaml | 4 +++- configs/iac_tldr_config.yaml | 4 +++- configs/maac_arxiv_config.yaml | 4 +++- configs/maac_tldr_config.yaml | 4 +++- configs/magrpo_arxiv_config.yaml | 3 ++- configs/magrpo_tldr_config.yaml | 3 ++- train_ac.py | 2 +- train_grpo.py | 2 +- train_iac.py | 2 +- train_maac.py | 2 +- train_magrpo.py | 2 +- 15 files changed, 25 insertions(+), 15 deletions(-) diff --git a/configs/ac_arxiv_config.yaml b/configs/ac_arxiv_config.yaml index fc76337..852521e 100644 --- a/configs/ac_arxiv_config.yaml +++ b/configs/ac_arxiv_config.yaml @@ -26,7 +26,7 @@ output: save_final_model: true save_path: output_ac_arxiv ac: - parallel_training: mp + parallel_training: none num_turns: 1 num_train_epochs: 4 agent_learning_rate: 5.0e-06 diff --git a/configs/ac_tldr_config.yaml b/configs/ac_tldr_config.yaml index 1b43c45..eed4651 100644 --- a/configs/ac_tldr_config.yaml +++ b/configs/ac_tldr_config.yaml @@ -26,7 +26,7 @@ output: save_final_model: true save_path: output_ac_tldr ac: - parallel_training: mp + parallel_training: none num_turns: 1 num_train_epochs: 4 agent_learning_rate: 5.0e-06 diff --git a/configs/grpo_arxiv_config.yaml b/configs/grpo_arxiv_config.yaml index 9ced681..3e38654 100644 --- a/configs/grpo_arxiv_config.yaml +++ b/configs/grpo_arxiv_config.yaml @@ 
-29,7 +29,7 @@ output: save_path: output_grpo_arxiv grpo: - parallel_training: mp + parallel_training: none num_turns: 1 num_train_epochs: 1 agent_learning_rate: 5.0e-6 diff --git a/configs/grpo_tldr_config.yaml b/configs/grpo_tldr_config.yaml index 44ef1dc..cf158f3 100644 --- a/configs/grpo_tldr_config.yaml +++ b/configs/grpo_tldr_config.yaml @@ -29,7 +29,7 @@ output: save_path: output_grpo_tldr grpo: - parallel_training: mp + parallel_training: none num_turns: 1 num_train_epochs: 1 agent_learning_rate: 5.0e-6 diff --git a/configs/iac_arxiv_config.yaml b/configs/iac_arxiv_config.yaml index 2bd84c3..bdbc264 100644 --- a/configs/iac_arxiv_config.yaml +++ b/configs/iac_arxiv_config.yaml @@ -26,7 +26,9 @@ output: save_final_model: true save_path: output_iac_arxiv iac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 1 use_separate_critic: true diff --git a/configs/iac_tldr_config.yaml b/configs/iac_tldr_config.yaml index 753da11..88be512 100644 --- a/configs/iac_tldr_config.yaml +++ b/configs/iac_tldr_config.yaml @@ -26,7 +26,9 @@ output: save_final_model: true save_path: output_iac_tldr iac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 1 use_separate_critic: true diff --git a/configs/maac_arxiv_config.yaml b/configs/maac_arxiv_config.yaml index 1928ce0..41c38ff 100644 --- a/configs/maac_arxiv_config.yaml +++ b/configs/maac_arxiv_config.yaml @@ -26,7 +26,9 @@ output: save_final_model: true save_path: output_maac_arxiv maac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 1 critic_type: v diff --git a/configs/maac_tldr_config.yaml b/configs/maac_tldr_config.yaml index b55fa8b..374cded 100644 --- a/configs/maac_tldr_config.yaml +++ b/configs/maac_tldr_config.yaml @@ -26,7 +26,9 @@ output: save_final_model: true 
save_path: output_maac_tldr maac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 1 critic_type: v diff --git a/configs/magrpo_arxiv_config.yaml b/configs/magrpo_arxiv_config.yaml index df9448e..91feacf 100644 --- a/configs/magrpo_arxiv_config.yaml +++ b/configs/magrpo_arxiv_config.yaml @@ -22,7 +22,8 @@ output: save_final_model: true save_path: output_magrpo_arxiv magrpo: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] num_turns: 1 num_train_epochs: 2 agent_learning_rate: 5.0e-06 diff --git a/configs/magrpo_tldr_config.yaml b/configs/magrpo_tldr_config.yaml index 83d56d7..8bc89f2 100644 --- a/configs/magrpo_tldr_config.yaml +++ b/configs/magrpo_tldr_config.yaml @@ -22,7 +22,8 @@ output: save_final_model: true save_path: output_magrpo_tldr magrpo: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] num_turns: 1 num_train_epochs: 2 agent_learning_rate: 5.0e-06 diff --git a/train_ac.py b/train_ac.py index a8db47a..6488f8d 100644 --- a/train_ac.py +++ b/train_ac.py @@ -291,7 +291,7 @@ def main() -> None: num_agents=1, num_generations=ac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_training=str(ac_cfg.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(ac_cfg.get("parallel_training", "none")).strip().lower(), agent_devices=ac_cfg.get("agent_devices", None), critic_devices=ac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=ac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_grpo.py b/train_grpo.py index b3d8324..2a3a903 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -225,7 +225,7 @@ def main(): top_p=top_p, top_k=top_k, num_agents=1, - parallel_training=str(grpo_cfg.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(grpo_cfg.get("parallel_training", "none")).strip().lower(), agent_devices=grpo_cfg.get("agent_devices", None), 
early_termination_threshold=grpo_cfg.get( "early_termination_threshold", -0.2 diff --git a/train_iac.py b/train_iac.py index 0ca0f32..57ab491 100644 --- a/train_iac.py +++ b/train_iac.py @@ -318,7 +318,7 @@ def main() -> None: num_agents=num_agents, num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, - parallel_training=str(iac_cfg.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(iac_cfg.get("parallel_training", "none")).strip().lower(), agent_devices=iac_cfg.get("agent_devices", None), critic_devices=iac_cfg.get("critic_devices", None), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), diff --git a/train_maac.py b/train_maac.py index bd7853d..ad99cfe 100644 --- a/train_maac.py +++ b/train_maac.py @@ -314,7 +314,7 @@ def main() -> None: top_k=top_k, num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), - parallel_training=str(maac_cfg.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(maac_cfg.get("parallel_training", "none")).strip().lower(), agent_devices=maac_cfg.get("agent_devices", None), critic_devices=maac_cfg.get("critic_devices", None), discount=maac_cfg.get("discount", 0.9), diff --git a/train_magrpo.py b/train_magrpo.py index 8d2bb59..c4e2061 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -341,7 +341,7 @@ def main(): top_p=top_p, top_k=top_k, num_agents=num_agents, - parallel_training=str(magrpo_cfg.get("parallel_training", "mp")).strip().lower(), + parallel_training=str(magrpo_cfg.get("parallel_training", "none")).strip().lower(), agent_devices=magrpo_cfg.get("agent_devices", None), early_termination_threshold=magrpo_cfg.get( "early_termination_threshold", -0.2 From 4bd98b1eec28795b62ae5e239f621ee4cb1dce31 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 11:01:18 -0500 Subject: [PATCH 10/12] ud --- train_iac.py | 10 +++++----- train_maac.py | 10 +++++----- train_magrpo.py | 12 ++++++------ 3 files changed, 
16 insertions(+), 16 deletions(-) diff --git a/train_iac.py b/train_iac.py index 57ab491..7c3736d 100644 --- a/train_iac.py +++ b/train_iac.py @@ -305,7 +305,7 @@ def main() -> None: external_transition=None, args=IACConfig( num_turns=1, - num_train_epochs=iac_cfg.get("num_train_epochs", 1), + num_train_epochs=iac_cfg.get("num_train_epochs", 20), agent_learning_rate=iac_cfg.get("agent_learning_rate", 5e-6), critic_learning_rate=iac_cfg.get("critic_learning_rate", 5e-6), value_loss_coef=iac_cfg.get("value_loss_coef", 0.6), @@ -319,15 +319,15 @@ def main() -> None: num_generations=iac_cfg.get("num_generations", 1), use_separate_critic=use_separate_critic, parallel_training=str(iac_cfg.get("parallel_training", "none")).strip().lower(), - agent_devices=iac_cfg.get("agent_devices", None), - critic_devices=iac_cfg.get("critic_devices", None), + agent_devices=iac_cfg.get("agent_devices", ["cuda:0"]), + critic_devices=iac_cfg.get("critic_devices", ["cuda:0"]), critic_value_head_hidden_dim=iac_cfg.get("critic_value_head_hidden_dim"), value_head_hidden_dim=iac_cfg.get("value_head_hidden_dim"), discount=iac_cfg.get("discount", 0.9), - eval_interval=iac_cfg.get("eval_interval", 4), + eval_interval=iac_cfg.get("eval_interval", 20), eval_num_samples=iac_cfg.get("eval_num_samples", 4), eval_batch_size=iac_cfg.get("eval_batch_size", 1), - logging_steps=iac_cfg.get("logging_steps", 1), + logging_steps=iac_cfg.get("logging_steps", 50), ), train_dataset=train_dataset, eval_dataset=eval_dataset, diff --git a/train_maac.py b/train_maac.py index ad99cfe..167621c 100644 --- a/train_maac.py +++ b/train_maac.py @@ -303,7 +303,7 @@ def main() -> None: external_transition=None, args=MAACConfig( num_turns=1, - num_train_epochs=maac_cfg.get("num_train_epochs", 1), + num_train_epochs=maac_cfg.get("num_train_epochs", 20), agent_learning_rate=maac_cfg.get("agent_learning_rate", 5e-6), critic_learning_rate=maac_cfg.get("critic_learning_rate", 5e-6), value_loss_coef=maac_cfg.get("value_loss_coef", 
0.6), @@ -315,14 +315,14 @@ def main() -> None: num_agents=num_agents, num_generations=maac_cfg.get("num_generations", 1), parallel_training=str(maac_cfg.get("parallel_training", "none")).strip().lower(), - agent_devices=maac_cfg.get("agent_devices", None), - critic_devices=maac_cfg.get("critic_devices", None), + agent_devices=maac_cfg.get("agent_devices", ["cuda:0"]), + critic_devices=maac_cfg.get("critic_devices", ["cuda:0"]), discount=maac_cfg.get("discount", 0.9), critic_type=maac_cfg.get("critic_type", "v"), - eval_interval=maac_cfg.get("eval_interval", 4), + eval_interval=maac_cfg.get("eval_interval", 20), eval_num_samples=maac_cfg.get("eval_num_samples", 4), eval_batch_size=maac_cfg.get("eval_batch_size", 1), - logging_steps=maac_cfg.get("logging_steps", 1), + logging_steps=maac_cfg.get("logging_steps", 50), ), train_dataset=train_dataset, eval_dataset=eval_dataset, diff --git a/train_magrpo.py b/train_magrpo.py index c4e2061..31d9041 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -332,9 +332,9 @@ def main(): magrpo_args = MAGRPOConfig( num_turns=1, - num_train_epochs=magrpo_cfg.get("num_train_epochs", 1), + num_train_epochs=magrpo_cfg.get("num_train_epochs", 2), agent_learning_rate=magrpo_cfg.get("agent_learning_rate", 5e-6), - logging_steps=magrpo_cfg.get("logging_steps", 10), + logging_steps=magrpo_cfg.get("logging_steps", 50), num_generations=magrpo_cfg.get("num_generations", 4), max_new_tokens=magrpo_cfg.get("max_new_tokens", 256), temperature=temperature, @@ -342,14 +342,14 @@ def main(): top_k=top_k, num_agents=num_agents, parallel_training=str(magrpo_cfg.get("parallel_training", "none")).strip().lower(), - agent_devices=magrpo_cfg.get("agent_devices", None), + agent_devices=magrpo_cfg.get("agent_devices", ["cuda:0"]), early_termination_threshold=magrpo_cfg.get( "early_termination_threshold", -0.2 ), - rollout_buffer_size=magrpo_cfg.get("rollout_buffer_size", 2), - train_batch_size=magrpo_cfg.get("train_batch_size"), + 
rollout_buffer_size=magrpo_cfg.get("rollout_buffer_size", 1), + train_batch_size=magrpo_cfg.get("train_batch_size", 1), advantage_normalization=magrpo_cfg.get("advantage_normalization", True), - eval_interval=magrpo_cfg.get("eval_interval", 4), + eval_interval=magrpo_cfg.get("eval_interval", 20), eval_num_samples=magrpo_cfg.get("eval_num_samples", 4), eval_batch_size=magrpo_cfg.get("eval_batch_size", 1), ) From cbb394708acdf06d6cc9e5eb3bba642f1b511c8d Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 17:02:02 -0500 Subject: [PATCH 11/12] ud --- configs/ac_arxiv_config.yaml | 11 ++++++++++- configs/ac_tldr_config.yaml | 11 ++++++++++- configs/grpo_arxiv_config.yaml | 31 +++++++++++++++++-------------- configs/grpo_tldr_config.yaml | 31 +++++++++++++++++-------------- configs/iac_arxiv_config.yaml | 15 +++++++++++++-- configs/iac_tldr_config.yaml | 15 +++++++++++++-- configs/maac_arxiv_config.yaml | 15 +++++++++++++-- configs/maac_tldr_config.yaml | 15 +++++++++++++-- configs/magrpo_arxiv_config.yaml | 12 +++++++++++- configs/magrpo_tldr_config.yaml | 12 +++++++++++- 10 files changed, 128 insertions(+), 40 deletions(-) diff --git a/configs/ac_arxiv_config.yaml b/configs/ac_arxiv_config.yaml index 852521e..e15a060 100644 --- a/configs/ac_arxiv_config.yaml +++ b/configs/ac_arxiv_config.yaml @@ -6,25 +6,32 @@ agent_model: top_k: null max_length: 2048 torch_dtype: auto + agents: null + critic_model: name: Qwen/Qwen3-1.7B type: qwen max_length: 2048 torch_dtype: auto + critics: null + dataset: name: OpenMLRL/arXiv_abstract type: arxiv train_split: train[:1000] eval_split: val[:1000] + tokenizer: padding_side: left + output: base_dir: output_ac_arxiv verbose: false save_final_model: true save_path: output_ac_arxiv + ac: parallel_training: none num_turns: 1 @@ -32,17 +39,19 @@ ac: agent_learning_rate: 5.0e-06 critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 - advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 512 + advantage_normalization: true 
eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 logging_steps: 100 + reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/ac_tldr_config.yaml b/configs/ac_tldr_config.yaml index eed4651..8c26b76 100644 --- a/configs/ac_tldr_config.yaml +++ b/configs/ac_tldr_config.yaml @@ -6,25 +6,32 @@ agent_model: top_k: null max_length: 2048 torch_dtype: auto + agents: null + critic_model: name: Qwen/Qwen3-1.7B type: qwen max_length: 2048 torch_dtype: auto + critics: null + dataset: name: trl-lib/tldr type: tldr train_split: train[:1000] eval_split: test[:1000] + tokenizer: padding_side: left + output: base_dir: output_ac_tldr verbose: false save_final_model: true save_path: output_ac_tldr + ac: parallel_training: none num_turns: 1 @@ -32,17 +39,19 @@ ac: agent_learning_rate: 5.0e-06 critic_learning_rate: 5.0e-06 value_loss_coef: 0.6 - advantage_normalization: true rollout_buffer_size: 4 max_new_tokens: 256 + advantage_normalization: true eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 logging_steps: 100 + reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/grpo_arxiv_config.yaml b/configs/grpo_arxiv_config.yaml index 3e38654..e887935 100644 --- a/configs/grpo_arxiv_config.yaml +++ b/configs/grpo_arxiv_config.yaml @@ -1,11 +1,11 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" + name: Qwen/Qwen3-1.7B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "auto" + torch_dtype: auto agents: null @@ -14,13 +14,13 @@ critic_model: null critics: null dataset: - name: "OpenMLRL/arXiv_abstract" - type: "arxiv" - train_split: "train[:1000]" - eval_split: "val[:1000]" + name: OpenMLRL/arXiv_abstract + type: arxiv + train_split: train[:1000] + eval_split: val[:1000] tokenizer: - padding_side: "left" + padding_side: left output: base_dir: output_grpo_arxiv @@ -32,11 +32,11 @@ grpo: 
parallel_training: none num_turns: 1 num_train_epochs: 1 - agent_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 logging_steps: 400 num_generations: 4 - joint_mode: aligned max_new_tokens: 512 + joint_mode: aligned rollout_buffer_size: 2 train_batch_size: 2 advantage_normalization: true @@ -50,7 +50,10 @@ reward_processor: shift: 0.0 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "grpo_arxiv" - tags: ["grpo", "arxiv", "single-agent"] + project: comlrl + entity: OpenMLRL + name: grpo_arxiv + tags: + - grpo + - arxiv + - single-agent diff --git a/configs/grpo_tldr_config.yaml b/configs/grpo_tldr_config.yaml index cf158f3..f1b0be9 100644 --- a/configs/grpo_tldr_config.yaml +++ b/configs/grpo_tldr_config.yaml @@ -1,11 +1,11 @@ agent_model: - name: "Qwen/Qwen3-1.7B" - type: "qwen" + name: Qwen/Qwen3-1.7B + type: qwen temperature: 0.6 top_p: 0.6 top_k: null max_length: 2048 - torch_dtype: "auto" + torch_dtype: auto agents: null @@ -14,13 +14,13 @@ critic_model: null critics: null dataset: - name: "trl-lib/tldr" - type: "tldr" - train_split: "train[:1000]" - eval_split: "test[:1000]" + name: trl-lib/tldr + type: tldr + train_split: train[:1000] + eval_split: test[:1000] tokenizer: - padding_side: "left" + padding_side: left output: base_dir: output_grpo_tldr @@ -32,11 +32,11 @@ grpo: parallel_training: none num_turns: 1 num_train_epochs: 1 - agent_learning_rate: 5.0e-6 + agent_learning_rate: 5.0e-06 logging_steps: 400 num_generations: 4 - joint_mode: aligned max_new_tokens: 256 + joint_mode: aligned rollout_buffer_size: 2 train_batch_size: 2 advantage_normalization: true @@ -50,7 +50,10 @@ reward_processor: shift: 0.0 wandb: - project: "comlrl" - entity: "OpenMLRL" - name: "grpo_tldr" - tags: ["grpo", "tldr", "single-agent"] + project: comlrl + entity: OpenMLRL + name: grpo_tldr + tags: + - grpo + - tldr + - single-agent diff --git a/configs/iac_arxiv_config.yaml b/configs/iac_arxiv_config.yaml index bdbc264..5c006be 100644 --- 
a/configs/iac_arxiv_config.yaml +++ b/configs/iac_arxiv_config.yaml @@ -6,29 +6,38 @@ agent_model: top_k: null max_length: 2048 torch_dtype: auto + agents: null + critic_model: name: Qwen/Qwen3-1.7B type: qwen max_length: 2048 torch_dtype: auto + critics: null + dataset: name: OpenMLRL/arXiv_abstract type: arxiv train_split: train[:1000] eval_split: val[:1000] + tokenizer: padding_side: left + output: base_dir: output_iac_arxiv verbose: false save_final_model: true save_path: output_iac_arxiv + iac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 1 use_separate_critic: true @@ -44,10 +53,12 @@ iac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 50 + reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/iac_tldr_config.yaml b/configs/iac_tldr_config.yaml index 88be512..9cce1f0 100644 --- a/configs/iac_tldr_config.yaml +++ b/configs/iac_tldr_config.yaml @@ -6,29 +6,38 @@ agent_model: top_k: null max_length: 2048 torch_dtype: auto + agents: null + critic_model: name: Qwen/Qwen3-1.7B type: qwen max_length: 2048 torch_dtype: auto + critics: null + dataset: name: trl-lib/tldr type: tldr train_split: train[:1000] eval_split: test[:1000] + tokenizer: padding_side: left + output: base_dir: output_iac_tldr verbose: false save_final_model: true save_path: output_iac_tldr + iac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 1 use_separate_critic: true @@ -44,10 +53,12 @@ iac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 50 + reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/maac_arxiv_config.yaml b/configs/maac_arxiv_config.yaml index 41c38ff..48aaab2 100644 --- 
a/configs/maac_arxiv_config.yaml +++ b/configs/maac_arxiv_config.yaml @@ -6,29 +6,38 @@ agent_model: top_k: null max_length: 2048 torch_dtype: auto + agents: null + critic_model: name: Qwen/Qwen3-1.7B type: qwen max_length: 2048 torch_dtype: auto + critics: null + dataset: name: OpenMLRL/arXiv_abstract type: arxiv train_split: train[:1100] eval_split: val[:1100] + tokenizer: padding_side: left + output: base_dir: output_maac_arxiv verbose: false save_final_model: true save_path: output_maac_arxiv + maac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 1 critic_type: v @@ -43,10 +52,12 @@ maac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 50 + reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/maac_tldr_config.yaml b/configs/maac_tldr_config.yaml index 374cded..c2c49d3 100644 --- a/configs/maac_tldr_config.yaml +++ b/configs/maac_tldr_config.yaml @@ -6,29 +6,38 @@ agent_model: top_k: null max_length: 2048 torch_dtype: auto + agents: null + critic_model: name: Qwen/Qwen3-1.7B type: qwen max_length: 2048 torch_dtype: auto + critics: null + dataset: name: trl-lib/tldr type: tldr train_split: train[:1100] eval_split: test[:1100] + tokenizer: padding_side: left + output: base_dir: output_maac_tldr verbose: false save_final_model: true save_path: output_maac_tldr + maac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 1 critic_type: v @@ -43,10 +52,12 @@ maac: eval_num_samples: 4 eval_batch_size: 1 logging_steps: 50 + reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/magrpo_arxiv_config.yaml b/configs/magrpo_arxiv_config.yaml index 91feacf..2f76c1b 100644 --- a/configs/magrpo_arxiv_config.yaml 
+++ b/configs/magrpo_arxiv_config.yaml @@ -6,24 +6,32 @@ agent_model: top_k: null max_length: 2048 torch_dtype: auto + agents: null + critic_model: null + critics: null + dataset: name: OpenMLRL/arXiv_abstract type: arxiv train_split: train[:1100] eval_split: val[:1100] + tokenizer: padding_side: left + output: base_dir: output_magrpo_arxiv verbose: false save_final_model: true save_path: output_magrpo_arxiv + magrpo: parallel_training: none - agent_devices: ["cuda:0"] + agent_devices: + - cuda:0 num_turns: 1 num_train_epochs: 2 agent_learning_rate: 5.0e-06 @@ -37,10 +45,12 @@ magrpo: eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 + reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 + wandb: project: comlrl entity: OpenMLRL diff --git a/configs/magrpo_tldr_config.yaml b/configs/magrpo_tldr_config.yaml index 8bc89f2..f52672b 100644 --- a/configs/magrpo_tldr_config.yaml +++ b/configs/magrpo_tldr_config.yaml @@ -6,24 +6,32 @@ agent_model: top_k: null max_length: 2048 torch_dtype: auto + agents: null + critic_model: null + critics: null + dataset: name: trl-lib/tldr type: tldr train_split: train[:1100] eval_split: test[:1100] + tokenizer: padding_side: left + output: base_dir: output_magrpo_tldr verbose: false save_final_model: true save_path: output_magrpo_tldr + magrpo: parallel_training: none - agent_devices: ["cuda:0"] + agent_devices: + - cuda:0 num_turns: 1 num_train_epochs: 2 agent_learning_rate: 5.0e-06 @@ -37,10 +45,12 @@ magrpo: eval_interval: 20 eval_num_samples: 4 eval_batch_size: 1 + reward_processor: enabled: true scale_factor: 1.0 shift: 0.0 + wandb: project: comlrl entity: OpenMLRL From 19b70ac1689c9d24c642e1f7bf8a11fcfa045efd Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 17:09:11 -0500 Subject: [PATCH 12/12] u d --- train_magrpo.py | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/train_magrpo.py b/train_magrpo.py index 31d9041..9367113 100644 --- a/train_magrpo.py +++ 
b/train_magrpo.py @@ -11,7 +11,7 @@ from config import Config, add_config_args, parse_overrides from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoTokenizer from loggers.arxiv_logger import ( aggregate_arxiv_metrics_for_logging, @@ -263,10 +263,6 @@ def main(): train_dataset = load_dataset(dataset_name, split=train_split) eval_dataset = load_dataset(dataset_name, split=eval_split) - model_kwargs: Dict[str, Any] = {} - if model_config.torch_dtype is not None: - model_kwargs["torch_dtype"] = model_config.torch_dtype - agents_field = config.get("agents") agent_names = None if isinstance(agents_field, (list, tuple)): @@ -302,22 +298,6 @@ def main(): tok.add_special_tokens(model_config.special_tokens) tokenizer = tokenizers[0] - if agent_names: - agents = [ - AutoModelForCausalLM.from_pretrained( - name, - **model_kwargs, - ) - for name in agent_names - ] - else: - agents = [ - AutoModelForCausalLM.from_pretrained( - model_name, - **model_kwargs, - ) - for _ in range(num_agents) - ] magrpo_cfg = config.get_section("magrpo") num_turns_cfg = magrpo_cfg.get("num_turns") if num_turns_cfg is not None and int(num_turns_cfg) != 1: @@ -407,8 +387,12 @@ def main(): trainer_kwargs: Dict[str, Any] = { "agent_model": model_name or None, - "agents": agents, + "agents": agent_names, "num_agents": num_agents, + "model_config": { + "torch_dtype": model_config.torch_dtype, + "special_tokens": model_config.special_tokens, + }, "reward_func": reward_func, "formatters": formatters, "args": magrpo_args,