From a0d54a42e58d8b9fd05c2fb0287c3f9c7139a39f Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Fri, 6 Mar 2026 13:47:25 +0200 Subject: [PATCH 01/30] Make node placement correctly propagate M-Bridge --- .../workloads/megatron_bridge/slurm_command_gen_strategy.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index 8d08c5696..1fae8e229 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -400,6 +400,12 @@ def add_field(field: str, flag: str, value: Any) -> None: add_field("nsys_trace", "--nsys_trace", self._list_or_comma_str(args.nsys_trace)) add_field("nsys_extra_args", "--nsys_extra_args", self._list_or_comma_str(args.nsys_extra_args)) + # Node placement (pass resolved nodelist to Megatron-Bridge via --additional_slurm_params) + _, node_list = self.get_cached_nodes_spec() + if node_list: + nodelist_str = ",".join(node_list) + parts.extend(["--additional_slurm_params", shlex.quote(f"nodelist={nodelist_str}")]) + # Config variant add_field("config_variant", "-cv", args.config_variant) if args.list_config_variants and "list_config_variants" in fields_set: From e9b1218d72d0f18ac1fb900f0c93a65c38147c0a Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Tue, 10 Mar 2026 14:01:43 +0200 Subject: [PATCH 02/30] Allow excluding nodes from group of nodes --- src/cloudai/_core/test_scenario.py | 1 + src/cloudai/models/scenario.py | 4 +++ .../slurm/slurm_command_gen_strategy.py | 10 ++++-- src/cloudai/systems/slurm/slurm_system.py | 36 ++++++++++++++----- src/cloudai/test_scenario_parser.py | 1 + tests/systems/slurm/test_allocation.py | 21 +++++++++++ tests/systems/slurm/test_system.py | 2 +- 7 files changed, 64 insertions(+), 11 deletions(-) diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index 5480adf78..22484ddc1 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -75,6 +75,7 @@ class TestRun: post_test: Optional[TestScenario] = None reports: Set[Type[ReportGenerationStrategy]] = field(default_factory=set) extra_srun_args: str | None = None + exclude_nodes: str | None = None def __hash__(self) -> int: return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration)) diff --git a/src/cloudai/models/scenario.py b/src/cloudai/models/scenario.py index cb5452af2..823a83527 100644 --- a/src/cloudai/models/scenario.py +++ b/src/cloudai/models/scenario.py @@ -80,6 +80,10 @@ class TestRunModel(BaseModel): time_limit: Optional[str] = None dependencies: list[TestRunDependencyModel] = Field(default_factory=list) extra_srun_args: str | None = None + exclude_nodes: str | None = Field( + default=None, + description="Comma-separated hostnames to exclude from the resolved node list.", + ) # test definition fields name: Optional[str] = None diff --git a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py index 5c17b4d17..661eda35f 100644 --- a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py @@ -26,7 +26,7 @@ from cloudai.core import CommandGenStrategy, Registry, System, TestRun, TestScenario from cloudai.models.scenario import TestRunDetails -from .slurm_system import SlurmSystem +from .slurm_system import SlurmSystem, parse_node_list class SlurmCommandGenStrategy(CommandGenStrategy): @@ -465,6 +465,7 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]: str(self.test_run.step), str(self.test_run.nnodes), ",".join(self.test_run.nodes), + self.test_run.exclude_nodes or "", ] ) @@ -472,5 +473,10 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]: logging.debug(f"Using cached node allocation for {cache_key}: {self._node_spec_cache[cache_key]}") return self._node_spec_cache[cache_key] - self._node_spec_cache[cache_key] = self.system.get_nodes_by_spec(self.test_run.nnodes, self.test_run.nodes) + excluded = set(parse_node_list(self.test_run.exclude_nodes)) if self.test_run.exclude_nodes else None + num_nodes, node_list = self.system.get_nodes_by_spec( + self.test_run.nnodes, self.test_run.nodes, exclude_nodes=excluded + ) + + self._node_spec_cache[cache_key] = (num_nodes, node_list) return self._node_spec_cache[cache_key] diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index 5edbed01a..36395b3bb 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -443,7 +443,11 @@ def format_range(lst: List[int], padding: int) -> List[str]: return ", ".join(formatted_ranges) def get_available_nodes_from_group( - self, partition_name: str, group_name: str, number_of_nodes: Union[int, str] + self, + partition_name: str, + group_name: str, + number_of_nodes: Union[int, str], + exclude_nodes: set[str] | None = None, ) -> List[SlurmNode]: """ Retrieve a specific number of potentially available nodes from a group within a partition. @@ -456,6 +460,7 @@ def get_available_nodes_from_group( group_name (str): The name of the group. number_of_nodes (Union[int,str]): The number of nodes to retrieve. Could also be 'all' to retrieve all the nodes from the group. + exclude_nodes (set[str] | None): Node names to exclude from the pool before selection. Returns: List[SlurmNode]: Objects that are potentially available for use. @@ -468,7 +473,7 @@ def get_available_nodes_from_group( self.validate_partition_and_group(partition_name, group_name) - grouped_nodes = self.group_nodes_by_state(partition_name, group_name) + grouped_nodes = self.group_nodes_by_state(partition_name, group_name, exclude_nodes=exclude_nodes) try: allocated_nodes = self.allocate_nodes(grouped_nodes, number_of_nodes, group_name) @@ -505,14 +510,19 @@ def validate_partition_and_group(self, partition_name: str, group_name: str) -> if group_name not in self.groups[partition_name]: raise ValueError(f"Group '{group_name}' not found in partition '{partition_name}'.") - def group_nodes_by_state(self, partition_name: str, group_name: str) -> Dict[SlurmNodeState, List[SlurmNode]]: + def group_nodes_by_state( + self, + partition_name: str, + group_name: str, + exclude_nodes: set[str] | None = None, + ) -> Dict[SlurmNodeState, List[SlurmNode]]: """ Group nodes by their states, excluding nodes allocated to the current user. Args: partition_name (str): The name of the partition. group_name (str): The name of the group. - current_user (str): The username of the current user. + exclude_nodes (set[str] | None): Node names to exclude from the pool before grouping. Returns: Dict[SlurmNodeState, List[SlurmNode]]: A dictionary grouping nodes by their state. @@ -524,6 +534,8 @@ def group_nodes_by_state(self, partition_name: str, group_name: str) -> Dict[Slu } for node in self.groups[partition_name][group_name]: + if exclude_nodes and node.name in exclude_nodes: + continue if node.state in grouped_nodes: grouped_nodes[node.state].append(node) @@ -671,7 +683,7 @@ def convert_state_to_enum(self, state_str: str) -> SlurmNodeState: logging.debug(f"Unknown node state: {core_state}") return SlurmNodeState.UNKNOWN_STATE - def parse_nodes(self, nodes: List[str]) -> List[str]: + def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) -> List[str]: """ Parse a list of node specifications into individual node names. @@ -684,6 +696,7 @@ def parse_nodes(self, nodes: List[str]) -> List[str]: "partition:group:num_nodes", where "partition" is the partition name, "group" is a group within that partition, and "num_nodes" is the number of nodes requested. Node ranges should be specified with square brackets and dashes, e.g., "node[01-03]" for "node01", "node02", "node03". + exclude_nodes (set[str] | None): Node names to exclude from group pools before selection. Returns: List[str]: A list of node names. For specifications, it includes names of allocated nodes based on the @@ -701,17 +714,23 @@ def parse_nodes(self, nodes: List[str]) -> List[str]: raise ValueError("Format should be partition:group:num_nodes") partition_name, group_name, num_nodes_spec = parts num_nodes = int(num_nodes_spec) if num_nodes_spec != "max_avail" else num_nodes_spec - group_nodes = self.get_available_nodes_from_group(partition_name, group_name, num_nodes) + group_nodes = self.get_available_nodes_from_group( + partition_name, group_name, num_nodes, exclude_nodes=exclude_nodes + ) parsed_nodes += [node.name for node in group_nodes] else: expanded_nodes = parse_node_list(node_spec) + if exclude_nodes: + expanded_nodes = [n for n in expanded_nodes if n not in exclude_nodes] parsed_nodes += expanded_nodes # Remove duplicates while preserving order parsed_nodes = list(dict.fromkeys(parsed_nodes)) return parsed_nodes - def get_nodes_by_spec(self, num_nodes: int, nodes: list[str]) -> Tuple[int, list[str]]: + def get_nodes_by_spec( + self, num_nodes: int, nodes: list[str], exclude_nodes: set[str] | None = None + ) -> Tuple[int, list[str]]: """ Retrieve a list of node names based on specifications. @@ -721,12 +740,13 @@ def get_nodes_by_spec(self, num_nodes: int, nodes: list[str]) -> Tuple[int, list Args: num_nodes (int): The number of nodes, can't be `0`. nodes (list[str]): A list of node names specifications, slurm format or `PARTITION:GROUP:NUM_NODES`. + exclude_nodes (set[str] | None): Node names to exclude from group pools before selection. Returns: Tuple[int, list[str]]: The number of nodes and a list of node names. """ num_nodes, node_list = num_nodes, [] - parsed_nodes = self.parse_nodes(nodes) + parsed_nodes = self.parse_nodes(nodes, exclude_nodes=exclude_nodes) if parsed_nodes: num_nodes = len(parsed_nodes) node_list = parsed_nodes diff --git a/src/cloudai/test_scenario_parser.py b/src/cloudai/test_scenario_parser.py index 774e8dd9c..a75c467af 100644 --- a/src/cloudai/test_scenario_parser.py +++ b/src/cloudai/test_scenario_parser.py @@ -208,6 +208,7 @@ def _create_test_run( post_test=post_test, reports=get_reporters(test_info, tdef), extra_srun_args=test_info.extra_srun_args, + exclude_nodes=test_info.exclude_nodes, ) return tr diff --git a/tests/systems/slurm/test_allocation.py b/tests/systems/slurm/test_allocation.py index 9426f5fa5..5913fa1a7 100644 --- a/tests/systems/slurm/test_allocation.py +++ b/tests/systems/slurm/test_allocation.py @@ -115,3 +115,24 @@ def test_group_allocation_is_preserved_on_updated(self, slurm_system: SlurmSyste ): system.update() assert all(node.state == SlurmNodeState.ALLOCATED for node in system.group_allocated) + + def test_exclude_nodes_selects_from_remaining_pool( + self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch + ): + """Excluding a node from a group should still yield the requested count from the remaining pool.""" + system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch) + nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:4"], exclude_nodes={"node03"}) + assert nnodes == 4 + assert "node03" not in nodes_list + assert len(nodes_list) == 4 + + def test_exclude_multiple_nodes_from_group( + self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch + ): + """Excluding multiple nodes still selects the requested count from remaining nodes.""" + system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch) + nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"], exclude_nodes={"node01", "node05"}) + assert nnodes == 3 + assert "node01" not in nodes_list + assert "node05" not in nodes_list + assert len(nodes_list) == 3 diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py index 0ad79b61f..6ba1da1d1 100644 --- a/tests/systems/slurm/test_system.py +++ b/tests/systems/slurm/test_system.py @@ -363,7 +363,7 @@ def test_explicit_node_names( num_nodes, node_list = slurm_system.get_nodes_by_spec(in_nnodes, in_nodes) - mock_parse_nodes.assert_called_once_with(in_nodes) + mock_parse_nodes.assert_called_once_with(in_nodes, exclude_nodes=None) assert num_nodes == exp_nnodes assert node_list == exp_nodes From d720616f683c92cdce044a22e50db42e955e06de Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 07:36:48 +0200 Subject: [PATCH 03/30] Also init submodules when install git repos --- .../kubernetes/kubernetes_installer.py | 15 +++++++ src/cloudai/systems/slurm/slurm_installer.py | 15 +++++++ tests/test_git_repo_installer.py | 39 +++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/src/cloudai/systems/kubernetes/kubernetes_installer.py b/src/cloudai/systems/kubernetes/kubernetes_installer.py index 6f18f2a7a..f360230ea 100644 --- a/src/cloudai/systems/kubernetes/kubernetes_installer.py +++ b/src/cloudai/systems/kubernetes/kubernetes_installer.py @@ -172,6 +172,13 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult: rmtree(repo_path) return res + res = self._init_submodules(repo_path) + if not res.success: + logging.error(f"Submodule init failed, removing cloned repository at {repo_path}") + if repo_path.exists(): + rmtree(repo_path) + return res + item.installed_path = repo_path return InstallStatusResult(True) @@ -237,6 +244,14 @@ def _checkout_commit(self, commit_hash: str, path: Path) -> InstallStatusResult: return InstallStatusResult(False, f"Failed to checkout commit {commit_hash}: {result.stderr}") return InstallStatusResult(True) + def _init_submodules(self, path: Path) -> InstallStatusResult: + logging.debug(f"Initializing submodules in {path}") + submodule_cmd = ["git", "submodule", "update", "--init", "--recursive"] + result = subprocess.run(submodule_cmd, cwd=str(path), capture_output=True, text=True) + if result.returncode != 0: + return InstallStatusResult(False, f"Failed to initialize submodules: {result.stderr}") + return InstallStatusResult(True) + def _verify_commit(self, ref: str, path: Path) -> InstallStatusResult: try: result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(path), capture_output=True, text=True) diff --git a/src/cloudai/systems/slurm/slurm_installer.py b/src/cloudai/systems/slurm/slurm_installer.py index 4800becc5..d286c32d7 100644 --- a/src/cloudai/systems/slurm/slurm_installer.py +++ b/src/cloudai/systems/slurm/slurm_installer.py @@ -225,6 +225,13 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult: rmtree(repo_path) return res + res = self._init_submodules(repo_path) + if not res.success: + logging.error(f"Submodule init failed, removing cloned repository at {repo_path}") + if repo_path.exists(): + rmtree(repo_path) + return res + item.installed_path = repo_path return InstallStatusResult(True) @@ -290,6 +297,14 @@ def _checkout_commit(self, commit_hash: str, path: Path) -> InstallStatusResult: return InstallStatusResult(False, f"Failed to checkout commit {commit_hash}: {result.stderr}") return InstallStatusResult(True) + def _init_submodules(self, path: Path) -> InstallStatusResult: + logging.debug(f"Initializing submodules in {path}") + submodule_cmd = ["git", "submodule", "update", "--init", "--recursive"] + result = subprocess.run(submodule_cmd, cwd=str(path), capture_output=True, text=True) + if result.returncode != 0: + return InstallStatusResult(False, f"Failed to initialize submodules: {result.stderr}") + return InstallStatusResult(True) + def _verify_commit(self, ref: str, path: Path) -> InstallStatusResult: try: result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(path), capture_output=True, text=True) diff --git a/tests/test_git_repo_installer.py b/tests/test_git_repo_installer.py index 545f1fe1d..3d12298c1 100644 --- a/tests/test_git_repo_installer.py +++ b/tests/test_git_repo_installer.py @@ -232,9 +232,48 @@ def test_verify_commit_branch_name_match(self, installer: Union[KubernetesInstal res = installer._verify_commit(ref, repo_path) assert res.success + def test_submodules_initialized(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo): + repo_path = installer.system.install_path / git.repo_name + repo_path.mkdir() + with patch("subprocess.run") as mock_run: + mock_run.return_value = CompletedProcess(args=[], returncode=0) + res = installer._init_submodules(repo_path) + assert res.success + mock_run.assert_called_once_with( + ["git", "submodule", "update", "--init", "--recursive"], + cwd=str(repo_path), + capture_output=True, + text=True, + ) + + def test_error_initializing_submodules(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo): + repo_path = installer.system.install_path / git.repo_name + repo_path.mkdir() + with patch("subprocess.run") as mock_run: + mock_run.return_value = CompletedProcess(args=[], returncode=1, stderr="err") + res = installer._init_submodules(repo_path) + assert not res.success + assert res.message == "Failed to initialize submodules: err" + + def test_submodule_failure_cleans_up_repo( + self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo + ): + repo_path = installer.system.install_path / git.repo_name + installer._clone_repository = Mock( + side_effect=lambda url, path: (path.mkdir(parents=True, exist_ok=True), InstallStatusResult(True))[1] + ) + installer._checkout_commit = Mock(return_value=InstallStatusResult(True)) + installer._init_submodules = Mock( + return_value=InstallStatusResult(False, "Failed to initialize submodules: err") + ) + res = installer._install_one_git_repo(git) + assert not res.success + assert not repo_path.exists() + def test_all_good_flow(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo): installer._clone_repository = Mock(return_value=InstallStatusResult(True)) installer._checkout_commit = Mock(return_value=InstallStatusResult(True)) + installer._init_submodules = Mock(return_value=InstallStatusResult(True)) res = installer._install_one_git_repo(git) assert res.success assert git.installed_path == installer.system.install_path / git.repo_name From ea4844db42bb7f35a90ec1d10c30069140fa4fc7 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 07:57:41 +0200 Subject: [PATCH 04/30] Add options to mount the repo in the container --- .../megatron_bridge/megatron_bridge.py | 10 ++++- .../slurm_command_gen_strategy.py | 17 +++++---- .../test_command_gen_strategy_slurm.py | 37 +++++++++++++++---- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py index 783072c59..7ff8f72b3 100644 --- a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py +++ b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py @@ -184,10 +184,16 @@ class MegatronBridgeTestDefinition(TestDefinition): @staticmethod def _select_megatron_bridge_repo(git_repos: list[GitRepo]) -> GitRepo | None: - """Return the Megatron-Bridge repo from `git_repos` (normalized to mount_as=/opt/Megatron-Bridge).""" + """ + Return the Megatron-Bridge repo from `git_repos`. + + When the user sets ``mount_as`` (e.g. ``/opt/Megatron-Bridge``), the installed clone will be bind-mounted + into the container at that path, overriding whatever the container image ships. When ``mount_as`` is *not* + set the container's built-in ``/opt/Megatron-Bridge`` is used. + """ for repo in git_repos: if "Megatron-Bridge" in repo.url or (repo.mount_as or "").rstrip("/") == "/opt/Megatron-Bridge": - return repo if repo.mount_as else repo.model_copy(update={"mount_as": "/opt/Megatron-Bridge"}) + return repo return None @field_validator("git_repos", mode="after") diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index 1fae8e229..064b7e199 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -225,14 +225,15 @@ def _installed_container_path() -> str: else: container_path = _installed_container_path() - # Use only test-level extra_container_mounts; never mount the Megatron-Bridge repo via -cm - # because the container uses its built-in copy. - mounts = [str(m).strip() for m in (tdef.extra_container_mounts or []) if str(m).strip()] - mounts = [ - m - for m in mounts - if "/opt/Megatron-Bridge" not in m and "Megatron-Bridge" not in m.split(":")[0].split("/")[-1] - ] + mounts: list[str] = [] + mounts.extend(tdef.extra_container_mounts or []) + + # When the user sets mount_as on the Megatron-Bridge git repo, bind-mount the + # installed clone into the container to override the image's built-in copy. + mb_repo = tdef.megatron_bridge_repo + if mb_repo.mount_as: + mb_host = mb_repo.installed_path.absolute() if mb_repo.installed_path else repo_path + mounts.append(f"{mb_host}:{mb_repo.mount_as}") venv_path = tdef.python_executable.venv_path or (self.system.install_path / tdef.python_executable.venv_name) python_bin = (venv_path / "bin" / "python").absolute() diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 23708e50c..120ad7de9 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -64,6 +64,7 @@ def _make( *, cmd_args_overrides: dict[str, Any] | None = None, git_commit: str = "r0.2.0", + mount_as: str | None = "/opt/Megatron-Bridge", output_subdir: str = "out", num_nodes: int = 2, ) -> TestRun: @@ -78,19 +79,20 @@ def _make( if cmd_args_overrides: cmd_args_data.update(cmd_args_overrides) + repo_kwargs: dict[str, Any] = { + "url": "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", + "commit": git_commit, + } + if mount_as is not None: + repo_kwargs["mount_as"] = mount_as + tdef = MegatronBridgeTestDefinition( name="mb", description="desc", test_template_name="MegatronBridge", cmd_args=MegatronBridgeCmdArgs.model_validate(cmd_args_data), extra_container_mounts=[], - git_repos=[ - GitRepo( - url="https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", - commit=git_commit, - mount_as="/opt/Megatron-Bridge", - ) - ], + git_repos=[GitRepo(**repo_kwargs)], ) self._configure_fake_installs(tdef, tmp_path) return TestRun( @@ -317,3 +319,24 @@ def test_use_recipes_emitted_only_when_true( cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) assert ("--use_recipes" in wrapper_content) is expected_in_wrapper + + def test_mount_as_adds_repo_to_container_mounts( + self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun], tmp_path: Path + ) -> None: + tr = make_test_run(mount_as="/opt/Megatron-Bridge", output_subdir="out_mount") + tdef = cast(MegatronBridgeTestDefinition, tr.test) + repo_path = tdef.megatron_bridge_repo.installed_path + assert repo_path is not None + + cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) + wrapper_content = self._wrapper_content(cmd_gen) + assert f"-cm {repo_path.absolute()}:/opt/Megatron-Bridge" in wrapper_content + + def test_no_mount_as_skips_repo_container_mount( + self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] + ) -> None: + tr = make_test_run(mount_as=None, output_subdir="out_no_mount") + cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) + wrapper_content = self._wrapper_content(cmd_gen) + assert "-cm" not in wrapper_content + assert "/opt/Megatron-Bridge" not in wrapper_content From 9f786ffed2aa7321eb9b696b3f7532c73315c133 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 09:56:26 +0200 Subject: [PATCH 05/30] Fix slurm gpu resource requesting --- .../megatron_bridge/slurm_command_gen_strategy.py | 11 +++++++++-- .../test_command_gen_strategy_slurm.py | 9 +++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index 064b7e199..338eb24fc 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -401,11 +401,18 @@ def add_field(field: str, flag: str, value: Any) -> None: add_field("nsys_trace", "--nsys_trace", self._list_or_comma_str(args.nsys_trace)) add_field("nsys_extra_args", "--nsys_extra_args", self._list_or_comma_str(args.nsys_extra_args)) - # Node placement (pass resolved nodelist to Megatron-Bridge via --additional_slurm_params) + additional_slurm_params: list[str] = [] + + if args.gpus_per_node: + additional_slurm_params.append(f"gpus-per-node={args.gpus_per_node}") + _, node_list = self.get_cached_nodes_spec() if node_list: nodelist_str = ",".join(node_list) - parts.extend(["--additional_slurm_params", shlex.quote(f"nodelist={nodelist_str}")]) + additional_slurm_params.append(f"nodelist={nodelist_str}") + + if additional_slurm_params: + parts.extend(["--additional_slurm_params", shlex.quote(" ".join(additional_slurm_params))]) # Config variant add_field("config_variant", "-cv", args.config_variant) diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 120ad7de9..430f36d38 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -340,3 +340,12 @@ def test_no_mount_as_skips_repo_container_mount( wrapper_content = self._wrapper_content(cmd_gen) assert "-cm" not in wrapper_content assert "/opt/Megatron-Bridge" not in wrapper_content + + def test_gpus_per_node_passed_as_additional_slurm_param( + self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] + ) -> None: + tr = make_test_run(cmd_args_overrides={"gpus_per_node": 4}, output_subdir="out_gpus") + cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) + wrapper_content = self._wrapper_content(cmd_gen) + assert "--additional_slurm_params" in wrapper_content + assert "gpus-per-node=4" in wrapper_content From e6c9cb22a0f5b605e325b2f70f8191b268b262a8 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 08:37:31 +0200 Subject: [PATCH 06/30] Update gb200 M-Bridge r0.3.0 qwen recipe --- .../test/gb200/megatron_bridge_qwen_30b.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml index b06b9d1f6..d7903c026 100644 --- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml @@ -22,12 +22,12 @@ extra_container_mounts = [] [[git_repos]] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" -commit = "v0.3.0" +commit = "r0.3.0" mount_as = "/opt/Megatron-Bridge" [cmd_args] gpu_type = "gb200" -container_image = "nvcr.io#nvidia/nemo:26.02.00" +container_image = "nvcr.io#nvidia/nemo:26.02" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" gpus_per_node = 4 @@ -37,3 +37,7 @@ task = "pretrain" compute_dtype = "fp8_mx" hf_token = "" enable_vboost = true +detach = false + +[extra_env_vars] +PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" From 2acfb29771cbdf703cf7c967912a586f3e56d868 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 10:26:26 +0200 Subject: [PATCH 07/30] ruff check & ruff format --- tests/systems/slurm/test_allocation.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/systems/slurm/test_allocation.py b/tests/systems/slurm/test_allocation.py index 5913fa1a7..cc690a920 100644 --- a/tests/systems/slurm/test_allocation.py +++ b/tests/systems/slurm/test_allocation.py @@ -120,17 +120,15 @@ def test_exclude_nodes_selects_from_remaining_pool( self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch ): """Excluding a node from a group should still yield the requested count from the remaining pool.""" - system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch) + system, _all_nodes, _ = self.prepare(slurm_system, [], monkeypatch) nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:4"], exclude_nodes={"node03"}) assert nnodes == 4 assert "node03" not in nodes_list assert len(nodes_list) == 4 - def test_exclude_multiple_nodes_from_group( - self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch - ): + def test_exclude_multiple_nodes_from_group(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch): """Excluding multiple nodes still selects the requested count from remaining nodes.""" - system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch) + system, _all_nodes, _ = self.prepare(slurm_system, [], monkeypatch) nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"], exclude_nodes={"node01", "node05"}) assert nnodes == 3 assert "node01" not in nodes_list From 45e6153a77b4fd2a65fa6395204e90c3c9a20d26 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 10:33:33 +0200 Subject: [PATCH 08/30] Fix tests to reflect updates --- tests/ref_data/megatron-bridge.sbatch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ref_data/megatron-bridge.sbatch b/tests/ref_data/megatron-bridge.sbatch index a65cc44ed..4c95d637f 100644 --- a/tests/ref_data/megatron-bridge.sbatch +++ b/tests/ref_data/megatron-bridge.sbatch @@ -19,7 +19,7 @@ if [ "${WANDB_INSTALL_RC}" -ne 0 ]; then fi LAUNCH_RC=0 -NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false >>"$LOG" 2>&1 || LAUNCH_RC=$? +NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params gpus-per-node=4 >>"$LOG" 2>&1 || LAUNCH_RC=$? JOB_ID="" From e14fa6f0e862183f789154b61aa5955cd1637228 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 10:39:13 +0200 Subject: [PATCH 09/30] Update copyright year --- src/cloudai/_core/test_scenario.py | 2 +- src/cloudai/systems/slurm/slurm_system.py | 2 +- src/cloudai/test_scenario_parser.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index 22484ddc1..a9329621b 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index 36395b3bb..3eb74563c 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/test_scenario_parser.py b/src/cloudai/test_scenario_parser.py index a75c467af..b324d638c 100644 --- a/src/cloudai/test_scenario_parser.py +++ b/src/cloudai/test_scenario_parser.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 54f4fbbc09b76740a89136473e92ec00cdda08ec Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 11:28:52 +0200 Subject: [PATCH 10/30] Add back the defensive filter --- .../workloads/megatron_bridge/slurm_command_gen_strategy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index 338eb24fc..7ae8034a8 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -225,8 +225,7 @@ def _installed_container_path() -> str: else: container_path = _installed_container_path() - mounts: list[str] = [] - mounts.extend(tdef.extra_container_mounts or []) + mounts: list[str] = [str(m).strip() for m in (tdef.extra_container_mounts or []) if str(m).strip()] # When the user sets mount_as on the Megatron-Bridge git repo, bind-mount the # installed clone into the container to override the image's built-in copy. From ff2f2fdc7cc8dfeff5802d5bff331b10dd81cd51 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 12:07:48 +0200 Subject: [PATCH 11/30] Enhance nodes allocation for edge cases --- .../slurm/slurm_command_gen_strategy.py | 3 ++ src/cloudai/systems/slurm/slurm_system.py | 8 +++++ .../slurm/test_command_gen_strategy.py | 33 +++++++++++++++++++ tests/systems/slurm/test_system.py | 22 +++++++++++++ 4 files changed, 66 insertions(+) diff --git a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py index 661eda35f..24725592f 100644 --- a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py @@ -424,6 +424,9 @@ def _append_nodes_related_directives(self, content: List[str]) -> Optional[Path] content.append(f"#SBATCH -N {num_nodes}") + if self.test_run.exclude_nodes: + content.append(f"#SBATCH --exclude={self.test_run.exclude_nodes}") + return None def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index 3eb74563c..7771139fe 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -744,12 +744,20 @@ def get_nodes_by_spec( Returns: Tuple[int, list[str]]: The number of nodes and a list of node names. + + Raises: + ValueError: If node specifications were provided but resolved to an empty list after applying exclusions. """ num_nodes, node_list = num_nodes, [] parsed_nodes = self.parse_nodes(nodes, exclude_nodes=exclude_nodes) if parsed_nodes: num_nodes = len(parsed_nodes) node_list = parsed_nodes + elif nodes: + raise ValueError( + f"Node specifications {nodes} resolved to an empty node list after applying " + f"exclude_nodes={exclude_nodes}. Cannot fall back to unconstrained allocation." + ) return num_nodes, sorted(node_list) def system_installables(self) -> list[Installable]: diff --git a/tests/systems/slurm/test_command_gen_strategy.py b/tests/systems/slurm/test_command_gen_strategy.py index e18fdf5ed..eeecc0298 100644 --- a/tests/systems/slurm/test_command_gen_strategy.py +++ b/tests/systems/slurm/test_command_gen_strategy.py @@ -361,6 +361,39 @@ def test_distribution_fallback_when_no_nodes(strategy_fixture: SlurmCommandGenSt assert "#SBATCH --nodelist=" not in content +def test_exclude_nodes_directive_when_no_nodelist(strategy_fixture: SlurmCommandGenStrategy) -> None: + strategy_fixture.test_run.nodes = [] + strategy_fixture.test_run.num_nodes = 3 + strategy_fixture.test_run.exclude_nodes = "node01,node02" + content: List[str] = [] + strategy_fixture._append_nodes_related_directives(content) + + assert "#SBATCH -N 3" in content + assert "#SBATCH --exclude=node01,node02" in content + + +def test_no_exclude_directive_when_nodelist_present(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> None: + testrun_fixture.nodes = ["node3", "node4"] + testrun_fixture.exclude_nodes = "node01,node02" + strategy = MySlurmCommandGenStrategy(slurm_system, testrun_fixture) + content: List[str] = [] + strategy._append_nodes_related_directives(content) + + assert "#SBATCH --nodelist=node3,node4" in content + assert "#SBATCH --exclude=" not in content + + +def test_no_exclude_directive_when_exclude_nodes_unset(strategy_fixture: SlurmCommandGenStrategy) -> None: + strategy_fixture.test_run.nodes = [] + strategy_fixture.test_run.num_nodes = 2 + strategy_fixture.test_run.exclude_nodes = None + content: List[str] = [] + strategy_fixture._append_nodes_related_directives(content) + + assert "#SBATCH -N 2" in content + assert not any("--exclude" in line for line in content) + + def test_nodelist_over_num_nodes(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> None: testrun_fixture.nodes = ["nodeA", "nodeB", "nodeC"] testrun_fixture.num_nodes = 5 diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py index 6ba1da1d1..e7a2ddd89 100644 --- a/tests/systems/slurm/test_system.py +++ b/tests/systems/slurm/test_system.py @@ -367,6 +367,28 @@ def test_explicit_node_names( assert num_nodes == exp_nnodes assert node_list == exp_nodes + @patch("cloudai.systems.slurm.slurm_system.SlurmSystem.parse_nodes") + def test_raises_when_all_nodes_excluded(self, mock_parse_nodes: Mock, slurm_system: SlurmSystem): + mock_parse_nodes.return_value = [] + exclude = {"node01", "node02"} + + with pytest.raises(ValueError, match="resolved to an empty node list"): + slurm_system.get_nodes_by_spec(2, ["node0[1-2]"], exclude_nodes=exclude) + + @patch("cloudai.systems.slurm.slurm_system.SlurmSystem.parse_nodes") + def test_raises_when_parse_nodes_returns_empty_for_nonempty_specs( + self, mock_parse_nodes: Mock, slurm_system: SlurmSystem + ): + mock_parse_nodes.return_value = [] + + with pytest.raises(ValueError, match="resolved to an empty node list"): + slurm_system.get_nodes_by_spec(1, ["main:group1:3"]) + + def test_empty_nodes_with_exclude_still_returns_unconstrained(self, slurm_system: SlurmSystem): + num_nodes, node_list = slurm_system.get_nodes_by_spec(3, [], exclude_nodes={"node01"}) + assert num_nodes == 3 + assert node_list == [] + class ConcreteSlurmStrategy(SlurmCommandGenStrategy): def _container_mounts(self) -> list[str]: From f26fe7bceecf7e880292b145cca929c93b471551 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 12:14:55 +0200 Subject: [PATCH 12/30] Enhance tests by avoiding default names --- .../megatron_bridge/test_command_gen_strategy_slurm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 430f36d38..7dfbfa7ec 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -323,14 +323,14 @@ def test_use_recipes_emitted_only_when_true( def test_mount_as_adds_repo_to_container_mounts( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun], tmp_path: Path ) -> None: - tr = make_test_run(mount_as="/opt/Megatron-Bridge", output_subdir="out_mount") + tr = make_test_run(mount_as="/opt/custom-megatron", output_subdir="out_mount") tdef = cast(MegatronBridgeTestDefinition, tr.test) repo_path = tdef.megatron_bridge_repo.installed_path assert repo_path is not None cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) - assert f"-cm {repo_path.absolute()}:/opt/Megatron-Bridge" in wrapper_content + assert f"-cm {repo_path.absolute()}:/opt/custom-megatron" in wrapper_content def test_no_mount_as_skips_repo_container_mount( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] @@ -344,8 +344,8 @@ def test_no_mount_as_skips_repo_container_mount( def test_gpus_per_node_passed_as_additional_slurm_param( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] ) -> None: - tr = make_test_run(cmd_args_overrides={"gpus_per_node": 4}, output_subdir="out_gpus") + tr = make_test_run(cmd_args_overrides={"gpus_per_node": 2}, output_subdir="out_gpus") cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) assert "--additional_slurm_params" in wrapper_content - assert "gpus-per-node=4" in wrapper_content + assert "gpus-per-node=2" in wrapper_content From a159736ae1fca5fd7139b6bf3c726e5d6ff3beac Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Wed, 11 Mar 2026 12:34:10 +0200 Subject: [PATCH 13/30] Make exclude nodes correctly propagate to M-Bridge --- .../workloads/megatron_bridge/slurm_command_gen_strategy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index 7ae8034a8..a05fe1079 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -409,6 +409,8 @@ def add_field(field: str, flag: str, value: Any) -> None: if node_list: nodelist_str = ",".join(node_list) additional_slurm_params.append(f"nodelist={nodelist_str}") + elif self.test_run.exclude_nodes: + additional_slurm_params.append(f"exclude={self.test_run.exclude_nodes}") if additional_slurm_params: parts.extend(["--additional_slurm_params", shlex.quote(" ".join(additional_slurm_params))]) From 5da0a578eb5d0a86007d08bb88290ecb1df4c4cb Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 08:23:46 +0200 Subject: [PATCH 14/30] Improve error messages for excluding nodes --- src/cloudai/systems/slurm/slurm_system.py | 11 ++++++++--- tests/systems/slurm/test_system.py | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index 7771139fe..46bb9b1bf 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -746,7 +746,7 @@ def get_nodes_by_spec( Tuple[int, list[str]]: The number of nodes and a list of node names. Raises: - ValueError: If node specifications were provided but resolved to an empty list after applying exclusions. + ValueError: If node specifications were provided but resolved to an empty list. """ num_nodes, node_list = num_nodes, [] parsed_nodes = self.parse_nodes(nodes, exclude_nodes=exclude_nodes) @@ -754,9 +754,14 @@ def get_nodes_by_spec( num_nodes = len(parsed_nodes) node_list = parsed_nodes elif nodes: + reason = ( + f"after excluding nodes {exclude_nodes}" + if exclude_nodes + else "— no nodes are available (all may be DRAIN/DOWN)" + ) raise ValueError( - f"Node specifications {nodes} resolved to an empty node list after applying " - f"exclude_nodes={exclude_nodes}. Cannot fall back to unconstrained allocation." + f"Node specifications {nodes} resolved to an empty node list {reason}. " + "Cannot fall back to unconstrained allocation." ) return num_nodes, sorted(node_list) diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py index e7a2ddd89..3a51f3360 100644 --- a/tests/systems/slurm/test_system.py +++ b/tests/systems/slurm/test_system.py @@ -372,7 +372,7 @@ def test_raises_when_all_nodes_excluded(self, mock_parse_nodes: Mock, slurm_syst mock_parse_nodes.return_value = [] exclude = {"node01", "node02"} - with pytest.raises(ValueError, match="resolved to an empty node list"): + with pytest.raises(ValueError, match="after excluding nodes"): slurm_system.get_nodes_by_spec(2, ["node0[1-2]"], exclude_nodes=exclude) @patch("cloudai.systems.slurm.slurm_system.SlurmSystem.parse_nodes") @@ -381,7 +381,7 @@ def test_raises_when_parse_nodes_returns_empty_for_nonempty_specs( ): mock_parse_nodes.return_value = [] - with pytest.raises(ValueError, match="resolved to an empty node list"): + with pytest.raises(ValueError, match="no nodes are available"): slurm_system.get_nodes_by_spec(1, ["main:group1:3"]) def test_empty_nodes_with_exclude_still_returns_unconstrained(self, slurm_system: SlurmSystem): From 2d7b98b2d845239c312fc283f167219bba717bbb Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 08:36:28 +0200 Subject: [PATCH 15/30] Improve exclude nodes APIs --- src/cloudai/_core/test_scenario.py | 2 +- src/cloudai/models/scenario.py | 11 ++++++---- .../slurm/slurm_command_gen_strategy.py | 9 ++++----- src/cloudai/systems/slurm/slurm_system.py | 20 +++++++++++-------- .../slurm_command_gen_strategy.py | 2 +- tests/systems/slurm/test_allocation.py | 4 ++-- .../slurm/test_command_gen_strategy.py | 6 +++--- tests/systems/slurm/test_system.py | 4 ++-- 8 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index a9329621b..5eb28820c 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -62,6 +62,7 @@ class TestRun: test: TestDefinition num_nodes: Union[int, list[int]] nodes: List[str] + exclude_nodes: List[str] = field(default_factory=list) output_path: Path = Path("") iterations: int = 1 current_iteration: int = 0 @@ -75,7 +76,6 @@ class TestRun: post_test: Optional[TestScenario] = None reports: Set[Type[ReportGenerationStrategy]] = field(default_factory=set) extra_srun_args: str | None = None - exclude_nodes: str | None = None def __hash__(self) -> int: return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration)) diff --git a/src/cloudai/models/scenario.py b/src/cloudai/models/scenario.py index 823a83527..beeb84244 100644 --- a/src/cloudai/models/scenario.py +++ b/src/cloudai/models/scenario.py @@ -73,6 +73,13 @@ class TestRunModel(BaseModel): test_name: Optional[str] = None num_nodes: int | list[int] | None = None nodes: list[str] = Field(default_factory=list) + exclude_nodes: list[str] = Field( + default_factory=list, + description=( + "Hostnames to exclude from the resolved node list. " + "Supports Slurm range syntax, e.g. ['node-048', 'node-[101-104]']." + ), + ) weight: int = 0 iterations: int = 1 sol: Optional[float] = None @@ -80,10 +87,6 @@ class TestRunModel(BaseModel): time_limit: Optional[str] = None dependencies: list[TestRunDependencyModel] = Field(default_factory=list) extra_srun_args: str | None = None - exclude_nodes: str | None = Field( - default=None, - description="Comma-separated hostnames to exclude from the resolved node list.", - ) # test definition fields name: Optional[str] = None diff --git a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py index 24725592f..67619a1b9 100644 --- a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py @@ -26,7 +26,7 @@ from cloudai.core import CommandGenStrategy, Registry, System, TestRun, TestScenario from cloudai.models.scenario import TestRunDetails -from .slurm_system import SlurmSystem, parse_node_list +from .slurm_system import SlurmSystem class SlurmCommandGenStrategy(CommandGenStrategy): @@ -425,7 +425,7 @@ def _append_nodes_related_directives(self, content: List[str]) -> Optional[Path] content.append(f"#SBATCH -N {num_nodes}") if self.test_run.exclude_nodes: - content.append(f"#SBATCH --exclude={self.test_run.exclude_nodes}") + content.append(f"#SBATCH --exclude={','.join(self.test_run.exclude_nodes)}") return None @@ -468,7 +468,7 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]: str(self.test_run.step), str(self.test_run.nnodes), ",".join(self.test_run.nodes), - self.test_run.exclude_nodes or "", + ",".join(self.test_run.exclude_nodes), ] ) @@ -476,9 +476,8 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]: logging.debug(f"Using cached node allocation for {cache_key}: {self._node_spec_cache[cache_key]}") return self._node_spec_cache[cache_key] - excluded = set(parse_node_list(self.test_run.exclude_nodes)) if self.test_run.exclude_nodes else None num_nodes, node_list = self.system.get_nodes_by_spec( - self.test_run.nnodes, self.test_run.nodes, exclude_nodes=excluded + self.test_run.nnodes, self.test_run.nodes, exclude_nodes=self.test_run.exclude_nodes or None ) self._node_spec_cache[cache_key] = (num_nodes, node_list) diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index 46bb9b1bf..ec5784ae5 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -447,7 +447,7 @@ def get_available_nodes_from_group( partition_name: str, group_name: str, number_of_nodes: Union[int, str], - exclude_nodes: set[str] | None = None, + exclude_nodes: list[str] | None = None, ) -> List[SlurmNode]: """ Retrieve a specific number of potentially available nodes from a group within a partition. @@ -460,7 +460,7 @@ def get_available_nodes_from_group( group_name (str): The name of the group. number_of_nodes (Union[int,str]): The number of nodes to retrieve. Could also be 'all' to retrieve all the nodes from the group. - exclude_nodes (set[str] | None): Node names to exclude from the pool before selection. + exclude_nodes (list[str] | None): Node names to exclude from the pool before selection. Returns: List[SlurmNode]: Objects that are potentially available for use. @@ -514,7 +514,7 @@ def group_nodes_by_state( self, partition_name: str, group_name: str, - exclude_nodes: set[str] | None = None, + exclude_nodes: list[str] | None = None, ) -> Dict[SlurmNodeState, List[SlurmNode]]: """ Group nodes by their states, excluding nodes allocated to the current user. @@ -522,7 +522,7 @@ def group_nodes_by_state( Args: partition_name (str): The name of the partition. group_name (str): The name of the group. - exclude_nodes (set[str] | None): Node names to exclude from the pool before grouping. + exclude_nodes (list[str] | None): Node names to exclude from the pool before grouping. Returns: Dict[SlurmNodeState, List[SlurmNode]]: A dictionary grouping nodes by their state. @@ -683,7 +683,7 @@ def convert_state_to_enum(self, state_str: str) -> SlurmNodeState: logging.debug(f"Unknown node state: {core_state}") return SlurmNodeState.UNKNOWN_STATE - def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) -> List[str]: + def parse_nodes(self, nodes: List[str], exclude_nodes: list[str] | None = None) -> List[str]: """ Parse a list of node specifications into individual node names. @@ -696,7 +696,8 @@ def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) - "partition:group:num_nodes", where "partition" is the partition name, "group" is a group within that partition, and "num_nodes" is the number of nodes requested. Node ranges should be specified with square brackets and dashes, e.g., "node[01-03]" for "node01", "node02", "node03". - exclude_nodes (set[str] | None): Node names to exclude from group pools before selection. + exclude_nodes (list[str] | None): Node names (or Slurm range expressions) to exclude from group pools + before selection. Ranges are expanded internally. Returns: List[str]: A list of node names. For specifications, it includes names of allocated nodes based on the @@ -706,6 +707,9 @@ def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) - ValueError: If a specification is malformed, a specified node is not found, or a node range cannot be parsed. This ensures users are aware of incorrect inputs. """ + if exclude_nodes: + exclude_nodes = [n for spec in exclude_nodes for n in parse_node_list(spec)] + parsed_nodes = [] for node_spec in nodes: if ":" in node_spec: @@ -729,7 +733,7 @@ def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) - return parsed_nodes def get_nodes_by_spec( - self, num_nodes: int, nodes: list[str], exclude_nodes: set[str] | None = None + self, num_nodes: int, nodes: list[str], exclude_nodes: list[str] | None = None ) -> Tuple[int, list[str]]: """ Retrieve a list of node names based on specifications. @@ -740,7 +744,7 @@ def get_nodes_by_spec( Args: num_nodes (int): The number of nodes, can't be `0`. nodes (list[str]): A list of node names specifications, slurm format or `PARTITION:GROUP:NUM_NODES`. - exclude_nodes (set[str] | None): Node names to exclude from group pools before selection. + exclude_nodes (list[str] | None): Node names to exclude from group pools before selection. Returns: Tuple[int, list[str]]: The number of nodes and a list of node names. diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index a05fe1079..f1854e732 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -410,7 +410,7 @@ def add_field(field: str, flag: str, value: Any) -> None: nodelist_str = ",".join(node_list) additional_slurm_params.append(f"nodelist={nodelist_str}") elif self.test_run.exclude_nodes: - additional_slurm_params.append(f"exclude={self.test_run.exclude_nodes}") + additional_slurm_params.append(f"exclude={','.join(self.test_run.exclude_nodes)}") if additional_slurm_params: parts.extend(["--additional_slurm_params", shlex.quote(" ".join(additional_slurm_params))]) diff --git a/tests/systems/slurm/test_allocation.py b/tests/systems/slurm/test_allocation.py index cc690a920..c8f06a624 100644 --- a/tests/systems/slurm/test_allocation.py +++ b/tests/systems/slurm/test_allocation.py @@ -121,7 +121,7 @@ def test_exclude_nodes_selects_from_remaining_pool( ): """Excluding a node from a group should still yield the requested count from the remaining pool.""" system, _all_nodes, _ = self.prepare(slurm_system, [], monkeypatch) - nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:4"], exclude_nodes={"node03"}) + nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:4"], exclude_nodes=["node03"]) assert nnodes == 4 assert "node03" not in nodes_list assert len(nodes_list) == 4 @@ -129,7 +129,7 @@ def test_exclude_nodes_selects_from_remaining_pool( def test_exclude_multiple_nodes_from_group(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch): """Excluding multiple nodes still selects the requested count from remaining nodes.""" system, _all_nodes, _ = self.prepare(slurm_system, [], monkeypatch) - nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"], exclude_nodes={"node01", "node05"}) + nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"], exclude_nodes=["node01", "node05"]) assert nnodes == 3 assert "node01" not in nodes_list assert "node05" not in nodes_list diff --git a/tests/systems/slurm/test_command_gen_strategy.py b/tests/systems/slurm/test_command_gen_strategy.py index eeecc0298..98e103704 100644 --- a/tests/systems/slurm/test_command_gen_strategy.py +++ b/tests/systems/slurm/test_command_gen_strategy.py @@ -364,7 +364,7 @@ def test_distribution_fallback_when_no_nodes(strategy_fixture: SlurmCommandGenSt def test_exclude_nodes_directive_when_no_nodelist(strategy_fixture: SlurmCommandGenStrategy) -> None: strategy_fixture.test_run.nodes = [] strategy_fixture.test_run.num_nodes = 3 - strategy_fixture.test_run.exclude_nodes = "node01,node02" + strategy_fixture.test_run.exclude_nodes = ["node01", "node02"] content: List[str] = [] strategy_fixture._append_nodes_related_directives(content) @@ -374,7 +374,7 @@ def test_exclude_nodes_directive_when_no_nodelist(strategy_fixture: SlurmCommand def test_no_exclude_directive_when_nodelist_present(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> None: testrun_fixture.nodes = ["node3", "node4"] - testrun_fixture.exclude_nodes = "node01,node02" + testrun_fixture.exclude_nodes = ["node01", "node02"] strategy = MySlurmCommandGenStrategy(slurm_system, testrun_fixture) content: List[str] = [] strategy._append_nodes_related_directives(content) @@ -386,7 +386,7 @@ def test_no_exclude_directive_when_nodelist_present(slurm_system: SlurmSystem, t def test_no_exclude_directive_when_exclude_nodes_unset(strategy_fixture: SlurmCommandGenStrategy) -> None: strategy_fixture.test_run.nodes = [] strategy_fixture.test_run.num_nodes = 2 - strategy_fixture.test_run.exclude_nodes = None + strategy_fixture.test_run.exclude_nodes = [] content: List[str] = [] strategy_fixture._append_nodes_related_directives(content) diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py index 3a51f3360..0c0e240b0 100644 --- a/tests/systems/slurm/test_system.py +++ b/tests/systems/slurm/test_system.py @@ -370,7 +370,7 @@ def test_explicit_node_names( @patch("cloudai.systems.slurm.slurm_system.SlurmSystem.parse_nodes") def test_raises_when_all_nodes_excluded(self, mock_parse_nodes: Mock, slurm_system: SlurmSystem): mock_parse_nodes.return_value = [] - exclude = {"node01", "node02"} + exclude = ["node01", "node02"] with pytest.raises(ValueError, match="after excluding nodes"): slurm_system.get_nodes_by_spec(2, ["node0[1-2]"], exclude_nodes=exclude) @@ -385,7 +385,7 @@ def test_raises_when_parse_nodes_returns_empty_for_nonempty_specs( slurm_system.get_nodes_by_spec(1, ["main:group1:3"]) def test_empty_nodes_with_exclude_still_returns_unconstrained(self, slurm_system: SlurmSystem): - num_nodes, node_list = slurm_system.get_nodes_by_spec(3, [], exclude_nodes={"node01"}) + num_nodes, node_list = slurm_system.get_nodes_by_spec(3, [], exclude_nodes=["node01"]) assert num_nodes == 3 assert node_list == [] From 174dcefae7936388256d975e9d401f038afafe44 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 08:54:04 +0200 Subject: [PATCH 16/30] Change additional_slurm_params separator to semi-colon --- .../workloads/megatron_bridge/slurm_command_gen_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index f1854e732..522f4b750 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -413,7 +413,7 @@ def add_field(field: str, flag: str, value: Any) -> None: additional_slurm_params.append(f"exclude={','.join(self.test_run.exclude_nodes)}") if additional_slurm_params: - parts.extend(["--additional_slurm_params", shlex.quote(" ".join(additional_slurm_params))]) + parts.extend(["--additional_slurm_params", shlex.quote(";".join(additional_slurm_params))]) # Config variant add_field("config_variant", "-cv", args.config_variant) From 3480ba87116b53ca01555adacee6a8001e8cc6c4 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 09:13:04 +0200 Subject: [PATCH 17/30] Make init_submodules optional --- .../test/gb200/megatron_bridge_qwen_30b.toml | 1 + src/cloudai/_core/installables.py | 1 + .../kubernetes/kubernetes_installer.py | 13 ++++++----- src/cloudai/systems/slurm/slurm_installer.py | 13 ++++++----- tests/test_git_repo_installer.py | 23 ++++++++++++++++++- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml index d7903c026..45976b712 100644 --- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml @@ -24,6 +24,7 @@ extra_container_mounts = [] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" commit = "r0.3.0" mount_as = "/opt/Megatron-Bridge" +init_submodules = true [cmd_args] gpu_type = "gb200" diff --git a/src/cloudai/_core/installables.py b/src/cloudai/_core/installables.py index f8527876c..dc0126a0f 100644 --- a/src/cloudai/_core/installables.py +++ b/src/cloudai/_core/installables.py @@ -91,6 +91,7 @@ class GitRepo(Installable, BaseModel): url: str commit: str + init_submodules: bool = False installed_path: Optional[Path] = None mount_as: Optional[str] = None diff --git a/src/cloudai/systems/kubernetes/kubernetes_installer.py b/src/cloudai/systems/kubernetes/kubernetes_installer.py index f360230ea..0f88985ee 100644 --- a/src/cloudai/systems/kubernetes/kubernetes_installer.py +++ b/src/cloudai/systems/kubernetes/kubernetes_installer.py @@ -172,12 +172,13 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult: rmtree(repo_path) return res - res = self._init_submodules(repo_path) - if not res.success: - logging.error(f"Submodule init failed, removing cloned repository at {repo_path}") - if repo_path.exists(): - rmtree(repo_path) - return res + if item.init_submodules: + res = self._init_submodules(repo_path) + if not res.success: + logging.error(f"Submodule init failed, removing cloned repository at {repo_path}") + if repo_path.exists(): + rmtree(repo_path) + return res item.installed_path = repo_path return InstallStatusResult(True) diff --git a/src/cloudai/systems/slurm/slurm_installer.py b/src/cloudai/systems/slurm/slurm_installer.py index d286c32d7..352ea69a3 100644 --- a/src/cloudai/systems/slurm/slurm_installer.py +++ b/src/cloudai/systems/slurm/slurm_installer.py @@ -225,12 +225,13 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult: rmtree(repo_path) return res - res = self._init_submodules(repo_path) - if not res.success: - logging.error(f"Submodule init failed, removing cloned repository at {repo_path}") - if repo_path.exists(): - rmtree(repo_path) - return res + if item.init_submodules: + res = self._init_submodules(repo_path) + if not res.success: + logging.error(f"Submodule init failed, removing cloned repository at {repo_path}") + if repo_path.exists(): + rmtree(repo_path) + return res item.installed_path = repo_path return InstallStatusResult(True) diff --git a/tests/test_git_repo_installer.py b/tests/test_git_repo_installer.py index 3d12298c1..dea795790 100644 --- a/tests/test_git_repo_installer.py +++ b/tests/test_git_repo_installer.py @@ -258,6 +258,7 @@ def test_error_initializing_submodules(self, installer: Union[KubernetesInstalle def test_submodule_failure_cleans_up_repo( self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo ): + git.init_submodules = True repo_path = installer.system.install_path / git.repo_name installer._clone_repository = Mock( side_effect=lambda url, path: (path.mkdir(parents=True, exist_ok=True), InstallStatusResult(True))[1] @@ -273,11 +274,31 @@ def test_submodule_failure_cleans_up_repo( def test_all_good_flow(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo): installer._clone_repository = Mock(return_value=InstallStatusResult(True)) installer._checkout_commit = Mock(return_value=InstallStatusResult(True)) - installer._init_submodules = Mock(return_value=InstallStatusResult(True)) res = installer._install_one_git_repo(git) assert res.success assert git.installed_path == installer.system.install_path / git.repo_name + def test_submodules_skipped_when_not_requested( + self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo + ): + installer._clone_repository = Mock(return_value=InstallStatusResult(True)) + installer._checkout_commit = Mock(return_value=InstallStatusResult(True)) + installer._init_submodules = Mock(return_value=InstallStatusResult(True)) + res = installer._install_one_git_repo(git) + assert res.success + installer._init_submodules.assert_not_called() + + def test_submodules_run_when_requested( + self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo + ): + git.init_submodules = True + installer._clone_repository = Mock(return_value=InstallStatusResult(True)) + installer._checkout_commit = Mock(return_value=InstallStatusResult(True)) + installer._init_submodules = Mock(return_value=InstallStatusResult(True)) + res = installer._install_one_git_repo(git) + assert res.success + installer._init_submodules.assert_called_once() + def test_uninstall_no_repo(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo): res = installer._uninstall_git_repo(git) assert res.success From e12ec45d4ee22cf3012e63d3a6225f8ac856ac1e Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 09:21:28 +0200 Subject: [PATCH 18/30] Update configurations --- .../test/b200/megatron_bridge_qwen_30b.toml | 8 ++++++-- .../test/gb200/megatron_bridge_qwen_30b.toml | 1 - .../test/gb300/megatron_bridge_qwen_30b.toml | 8 ++++++-- .../test/h100/megatron_bridge_qwen_30b.toml | 8 ++++++-- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml index 238dbc7de..108f78993 100644 --- a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml @@ -22,12 +22,13 @@ extra_container_mounts = [] [[git_repos]] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" -commit = "v0.3.0" +commit = "r0.3.0" mount_as = "/opt/Megatron-Bridge" +init_submodules = true [cmd_args] gpu_type = "b200" -container_image = "nvcr.io#nvidia/nemo:26.02.00" +container_image = "nvcr.io#nvidia/nemo:26.02" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" gpus_per_node = 8 @@ -37,3 +38,6 @@ task = "pretrain" compute_dtype = "fp8_mx" hf_token = "" enable_vboost = true + +[extra_env_vars] +PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml index 45976b712..2225ac2ed 100644 --- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml @@ -38,7 +38,6 @@ task = "pretrain" compute_dtype = "fp8_mx" hf_token = "" enable_vboost = true -detach = false [extra_env_vars] PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" diff --git a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml index 22b692e27..44b724e75 100644 --- a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml @@ -22,12 +22,13 @@ extra_container_mounts = [] [[git_repos]] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" -commit = "v0.3.0" +commit = "r0.3.0" mount_as = "/opt/Megatron-Bridge" +init_submodules = true [cmd_args] gpu_type = "gb300" -container_image = "nvcr.io#nvidia/nemo:26.02.00" +container_image = "nvcr.io#nvidia/nemo:26.02" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" gpus_per_node = 4 @@ -37,3 +38,6 @@ task = "pretrain" compute_dtype = "fp8_mx" hf_token = "" enable_vboost = true + +[extra_env_vars] +PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" diff --git a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml index 68d378052..200e256c6 100644 --- a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml @@ -22,12 +22,13 @@ extra_container_mounts = [] [[git_repos]] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" -commit = "v0.3.0" +commit = "r0.3.0" mount_as = "/opt/Megatron-Bridge" +init_submodules = true [cmd_args] gpu_type = "h100" -container_image = "nvcr.io#nvidia/nemo:26.02.00" +container_image = "nvcr.io#nvidia/nemo:26.02" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" gpus_per_node = 8 @@ -37,3 +38,6 @@ task = "pretrain" compute_dtype = "fp8_cs" hf_token = "" enable_vboost = true + +[extra_env_vars] +PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" From af471d5371c0d0da9cbb2497e1a12caae3e8fa73 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 10:06:53 +0200 Subject: [PATCH 19/30] Make extra_srun_args correctly propagate to M-Bridge --- .../slurm_command_gen_strategy.py | 21 ++++++++++++++++ .../test_command_gen_strategy_slurm.py | 24 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index 522f4b750..eda0ba1d0 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -108,6 +108,23 @@ def _normalize_recompute_modules(self, val: Any) -> str: joined = ",".join(items) return f'"{joined}"' + @staticmethod + def _parse_srun_args_as_slurm_params(srun_args: str) -> list[str]: + """Convert ``--key value`` pairs from extra_srun_args into ``key=value`` for --additional_slurm_params.""" + params: list[str] = [] + tokens = srun_args.split() + i = 0 + while i < len(tokens): + tok = tokens[i] + if tok.startswith("--") and "=" in tok: + key, val = tok[2:].split("=", 1) + params.append(f"{key}={val}") + elif tok.startswith("--") and i + 1 < len(tokens) and not tokens[i + 1].startswith("--"): + params.append(f"{tok[2:]}={tokens[i + 1]}") + i += 1 + i += 1 + return params + def _normalize_cuda_graph_scope_arg(self, val: Any) -> str: s = str(val).strip().strip("\"'") if s.startswith("[") and s.endswith("]"): @@ -412,6 +429,10 @@ def add_field(field: str, flag: str, value: Any) -> None: elif self.test_run.exclude_nodes: additional_slurm_params.append(f"exclude={','.join(self.test_run.exclude_nodes)}") + for source in (self.system.extra_srun_args, self.test_run.extra_srun_args): + if source: + additional_slurm_params.extend(self._parse_srun_args_as_slurm_params(source)) + if additional_slurm_params: parts.extend(["--additional_slurm_params", shlex.quote(";".join(additional_slurm_params))]) diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 7dfbfa7ec..4970eae17 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -349,3 +349,27 @@ def test_gpus_per_node_passed_as_additional_slurm_param( wrapper_content = self._wrapper_content(cmd_gen) assert "--additional_slurm_params" in wrapper_content assert "gpus-per-node=2" in wrapper_content + + def test_system_extra_srun_args_forwarded( + self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] + ) -> None: + configured_slurm_system.extra_srun_args = "--reservation my_reserv" + tr = make_test_run(output_subdir="out_srun") + cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) + wrapper_content = self._wrapper_content(cmd_gen) + assert "reservation=my_reserv" in wrapper_content + + def test_test_run_extra_srun_args_forwarded( + self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] + ) -> None: + tr = make_test_run(output_subdir="out_tr_srun") + tr.extra_srun_args = "--exclusive" + cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) + wrapper_content = self._wrapper_content(cmd_gen) + assert "--additional_slurm_params" in wrapper_content + + def test_parse_srun_args_as_slurm_params(self) -> None: + result = MegatronBridgeSlurmCommandGenStrategy._parse_srun_args_as_slurm_params( + "--reservation my_reserv --constraint=gpu" + ) + assert result == ["reservation=my_reserv", "constraint=gpu"] From 52cef4c10f51839763fa031ea75d867f908d88f4 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 10:08:10 +0200 Subject: [PATCH 20/30] Allow submission of jobs to resv nodes --- src/cloudai/systems/slurm/slurm_system.py | 2 ++ tests/systems/slurm/test_system.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index ec5784ae5..2ec73b6fb 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -531,6 +531,7 @@ def group_nodes_by_state( SlurmNodeState.IDLE: [], SlurmNodeState.COMPLETING: [], SlurmNodeState.ALLOCATED: [], + SlurmNodeState.RESERVED: [], } for node in self.groups[partition_name][group_name]: @@ -566,6 +567,7 @@ def allocate_nodes( if isinstance(number_of_nodes, str) and number_of_nodes == "max_avail": allocated_nodes.extend(grouped_nodes[SlurmNodeState.IDLE]) allocated_nodes.extend(grouped_nodes[SlurmNodeState.COMPLETING]) + allocated_nodes.extend(grouped_nodes[SlurmNodeState.RESERVED]) if len(allocated_nodes) == 0: raise ValueError( diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py index 0c0e240b0..477015ed2 100644 --- a/tests/systems/slurm/test_system.py +++ b/tests/systems/slurm/test_system.py @@ -140,6 +140,7 @@ def grouped_nodes() -> dict[SlurmNodeState, list[SlurmNode]]: SlurmNode(name="node04", partition=partition_name, state=SlurmNodeState.COMPLETING) ], SlurmNodeState.ALLOCATED: [SlurmNode(name="node05", partition=partition_name, state=SlurmNodeState.ALLOCATED)], + SlurmNodeState.RESERVED: [SlurmNode(name="node06", partition=partition_name, state=SlurmNodeState.RESERVED)], } return grouped_nodes @@ -166,6 +167,7 @@ def test_allocate_nodes_max_avail(slurm_system: SlurmSystem, grouped_nodes: dict grouped_nodes[SlurmNodeState.IDLE][0].name, grouped_nodes[SlurmNodeState.IDLE][1].name, grouped_nodes[SlurmNodeState.COMPLETING][0].name, + grouped_nodes[SlurmNodeState.RESERVED][0].name, ] returned_node_names = [node.name for node in available_nodes] @@ -193,8 +195,8 @@ def test_allocate_nodes_exceeding_limit( slurm_system: SlurmSystem, grouped_nodes: dict[SlurmNodeState, list[SlurmNode]] ): group_name = "group_name" - num_nodes = 5 - available_nodes = 4 + num_nodes = 6 + available_nodes = 5 with pytest.raises( ValueError, From 62f5cf7fb26092da8d203edf3086d57b76c852ec Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 10:52:00 +0200 Subject: [PATCH 21/30] Make the no-mount assertion less brittle --- .../megatron_bridge/test_command_gen_strategy_slurm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 4970eae17..4e765955d 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -336,10 +336,14 @@ def test_no_mount_as_skips_repo_container_mount( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] ) -> None: tr = make_test_run(mount_as=None, output_subdir="out_no_mount") + tdef = cast(MegatronBridgeTestDefinition, tr.test) + repo_path = tdef.megatron_bridge_repo.installed_path + assert repo_path is not None + cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) - assert "-cm" not in wrapper_content - assert "/opt/Megatron-Bridge" not in wrapper_content + assert f"{repo_path.absolute()}:" not in wrapper_content + assert ":/opt/Megatron-Bridge" not in wrapper_content def test_gpus_per_node_passed_as_additional_slurm_param( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] From f1b027edc5c00c9b1c26f3c8a274deca2584250d Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 10:59:30 +0200 Subject: [PATCH 22/30] Fix and improve the extra_srun_args propagation --- .../megatron_bridge/slurm_command_gen_strategy.py | 9 ++++++++- .../megatron_bridge/test_command_gen_strategy_slurm.py | 10 ++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index eda0ba1d0..d814a6859 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -110,7 +110,12 @@ def _normalize_recompute_modules(self, val: Any) -> str: @staticmethod def _parse_srun_args_as_slurm_params(srun_args: str) -> list[str]: - """Convert ``--key value`` pairs from extra_srun_args into ``key=value`` for --additional_slurm_params.""" + """ + Convert ``--key value`` pairs from extra_srun_args into ``key=value`` for --additional_slurm_params. + + Standalone boolean flags (e.g. ``--exclusive``) are emitted as bare + key names without a ``=value`` suffix. + """ params: list[str] = [] tokens = srun_args.split() i = 0 @@ -122,6 +127,8 @@ def _parse_srun_args_as_slurm_params(srun_args: str) -> list[str]: elif tok.startswith("--") and i + 1 < len(tokens) and not tokens[i + 1].startswith("--"): params.append(f"{tok[2:]}={tokens[i + 1]}") i += 1 + elif tok.startswith("--"): + params.append(tok[2:]) i += 1 return params diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 4e765955d..b8e599388 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -367,13 +367,19 @@ def test_test_run_extra_srun_args_forwarded( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] ) -> None: tr = make_test_run(output_subdir="out_tr_srun") - tr.extra_srun_args = "--exclusive" + tr.extra_srun_args = "--constraint gpu" cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) - assert "--additional_slurm_params" in wrapper_content + assert "constraint=gpu" in wrapper_content def test_parse_srun_args_as_slurm_params(self) -> None: result = MegatronBridgeSlurmCommandGenStrategy._parse_srun_args_as_slurm_params( "--reservation my_reserv --constraint=gpu" ) assert result == ["reservation=my_reserv", "constraint=gpu"] + + def test_parse_srun_args_boolean_flags(self) -> None: + result = MegatronBridgeSlurmCommandGenStrategy._parse_srun_args_as_slurm_params( + "--exclusive --reservation my_reserv --overcommit" + ) + assert result == ["exclusive", "reservation=my_reserv", "overcommit"] From b6a5dcf3d0928c9d3f81e53a401e4d4a42a131da Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Mon, 16 Mar 2026 12:23:45 +0200 Subject: [PATCH 23/30] ruff format & copyright year --- .../test_scenario/megatron_bridge_qwen_30b.toml | 2 +- src/cloudai/_core/installables.py | 2 +- tests/test_git_repo_installer.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_qwen_30b.toml index 16d218f84..d9dc23419 100644 --- a/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_qwen_30b.toml @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/_core/installables.py b/src/cloudai/_core/installables.py index dc0126a0f..d6db18589 100644 --- a/src/cloudai/_core/installables.py +++ b/src/cloudai/_core/installables.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/test_git_repo_installer.py b/tests/test_git_repo_installer.py index dea795790..623fcc33c 100644 --- a/tests/test_git_repo_installer.py +++ b/tests/test_git_repo_installer.py @@ -288,9 +288,7 @@ def test_submodules_skipped_when_not_requested( assert res.success installer._init_submodules.assert_not_called() - def test_submodules_run_when_requested( - self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo - ): + def test_submodules_run_when_requested(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo): git.init_submodules = True installer._clone_repository = Mock(return_value=InstallStatusResult(True)) installer._checkout_commit = Mock(return_value=InstallStatusResult(True)) From f463df331275960f5b2a610ed11a806f23cd44c1 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Tue, 17 Mar 2026 06:36:19 +0200 Subject: [PATCH 24/30] Fix init_submodules is silently bypassed on pre-existing clones --- .../systems/kubernetes/kubernetes_installer.py | 13 ++++++++++++- src/cloudai/systems/slurm/slurm_installer.py | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/cloudai/systems/kubernetes/kubernetes_installer.py b/src/cloudai/systems/kubernetes/kubernetes_installer.py index 0f88985ee..5b1ebdf30 100644 --- a/src/cloudai/systems/kubernetes/kubernetes_installer.py +++ b/src/cloudai/systems/kubernetes/kubernetes_installer.py @@ -156,11 +156,23 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult: verify_res = self._verify_commit(item.commit, repo_path) if not verify_res.success: return verify_res + if item.init_submodules: + res = self._init_submodules(repo_path) + if not res.success: + return res item.installed_path = repo_path msg = f"Git repository already exists at {repo_path}." logging.debug(msg) return InstallStatusResult(True, msg) + res = self._clone_and_setup_repo(item, repo_path) + if not res.success: + return res + + item.installed_path = repo_path + return InstallStatusResult(True) + + def _clone_and_setup_repo(self, item: GitRepo, repo_path: Path) -> InstallStatusResult: res = self._clone_repository(item.url, repo_path) if not res.success: return res @@ -180,7 +192,6 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult: rmtree(repo_path) return res - item.installed_path = repo_path return InstallStatusResult(True) def _install_python_executable(self, item: PythonExecutable) -> InstallStatusResult: diff --git a/src/cloudai/systems/slurm/slurm_installer.py b/src/cloudai/systems/slurm/slurm_installer.py index 352ea69a3..e30a87395 100644 --- a/src/cloudai/systems/slurm/slurm_installer.py +++ b/src/cloudai/systems/slurm/slurm_installer.py @@ -209,11 +209,23 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult: verify_res = self._verify_commit(item.commit, repo_path) if not verify_res.success: return verify_res + if item.init_submodules: + res = self._init_submodules(repo_path) + if not res.success: + return res item.installed_path = repo_path msg = f"Git repository already exists at {repo_path}." logging.debug(msg) return InstallStatusResult(True, msg) + res = self._clone_and_setup_repo(item, repo_path) + if not res.success: + return res + + item.installed_path = repo_path + return InstallStatusResult(True) + + def _clone_and_setup_repo(self, item: GitRepo, repo_path: Path) -> InstallStatusResult: res = self._clone_repository(item.url, repo_path) if not res.success: return res @@ -233,7 +245,6 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult: rmtree(repo_path) return res - item.installed_path = repo_path return InstallStatusResult(True) def _install_python_executable(self, item: PythonExecutable) -> InstallStatusResult: From 6edc15ee4d605c83bb71210cf95e3aef36b1d206 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Tue, 17 Mar 2026 06:37:53 +0200 Subject: [PATCH 25/30] Use shlex.split() instead of str.split() --- .../workloads/megatron_bridge/slurm_command_gen_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index d814a6859..3c5ee258a 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -117,7 +117,7 @@ def _parse_srun_args_as_slurm_params(srun_args: str) -> list[str]: key names without a ``=value`` suffix. """ params: list[str] = [] - tokens = srun_args.split() + tokens = shlex.split(srun_args) i = 0 while i < len(tokens): tok = tokens[i] From 66fbb33565c954d3803797054b379c2844d51392 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Tue, 17 Mar 2026 06:39:54 +0200 Subject: [PATCH 26/30] Add missing regression: existing repo + init_submodules=True --- tests/test_git_repo_installer.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_git_repo_installer.py b/tests/test_git_repo_installer.py index 623fcc33c..fbcaffe52 100644 --- a/tests/test_git_repo_installer.py +++ b/tests/test_git_repo_installer.py @@ -297,6 +297,19 @@ def test_submodules_run_when_requested(self, installer: Union[KubernetesInstalle assert res.success installer._init_submodules.assert_called_once() + def test_existing_repo_inits_submodules_when_requested( + self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo + ): + git.init_submodules = True + repo_path = installer.system.install_path / git.repo_name + repo_path.mkdir() + installer._verify_commit = Mock(return_value=InstallStatusResult(True)) + installer._init_submodules = Mock(return_value=InstallStatusResult(True)) + res = installer._install_one_git_repo(git) + assert res.success + assert git.installed_path == repo_path + installer._init_submodules.assert_called_once_with(repo_path) + def test_uninstall_no_repo(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo): res = installer._uninstall_git_repo(git) assert res.success From 59166d61fe27fe7204b5018c9599cba5079cc113 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 19 Mar 2026 09:08:23 +0200 Subject: [PATCH 27/30] Export container-runtime env vars before Megatron-Bridge launcher --- .../slurm_command_gen_strategy.py | 31 +++++++++++++++++- .../test_command_gen_strategy_slurm.py | 32 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index 3c5ee258a..b590444a7 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -37,6 +37,14 @@ class MegatronBridgeSlurmCommandGenStrategy(SlurmCommandGenStrategy): The launcher submits the actual training sbatch job; CloudAI tracks that job ID via SlurmRunner parsing. """ + CONTAINER_RUNTIME_ENV_VARS: frozenset[str] = frozenset( + { + "MELLANOX_VISIBLE_DEVICES", + "NVIDIA_VISIBLE_DEVICES", + "NVIDIA_DRIVER_CAPABILITIES", + } + ) + def _container_mounts(self) -> list[str]: # This workload submits its own sbatch job and passes mounts via `-cm`. return [] @@ -96,6 +104,25 @@ def _build_custom_bash_env_exports(self) -> list[str]: exports.extend(["-cb", shlex.quote(f"export {key}={value}")]) return exports + def _container_runtime_env_exports(self) -> list[str]: + """ + Build ``export`` lines for container-runtime env vars. + + Variables like ``MELLANOX_VISIBLE_DEVICES`` and ``NVIDIA_VISIBLE_DEVICES`` + are consumed by the NVIDIA container toolkit / enroot at container-creation + time to decide which devices to mount. They must be present in the process + environment **before** the Megatron-Bridge launcher calls ``sbatch`` so that + Slurm inherits them into the job and ``srun`` passes them to the container + runtime. Exporting them in the wrapper script (which runs on the submit + node) achieves this. The same variables are still passed via ``-cb`` as + well, so they are also set inside the container for any runtime readers. + """ + lines: list[str] = [] + for key, value in sorted(self.final_env_vars.items()): + if key in self.CONTAINER_RUNTIME_ENV_VARS: + lines.append(f"export {key}={shlex.quote(str(value))}") + return lines + def _normalize_recompute_modules(self, val: Any) -> str: if isinstance(val, list): items = [str(x).strip().strip("\"'") for x in val if str(x).strip()] @@ -152,6 +179,8 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str, launcher wrapper_path = output_dir / "cloudai_megatron_bridge_submit_and_parse_jobid.sh" log_path = output_dir / "cloudai_megatron_bridge_launcher.log" + container_runtime_exports = self._container_runtime_env_exports() + script_lines = [ "#!/usr/bin/env bash", "set -o pipefail", @@ -164,7 +193,7 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str, launcher # Mirror wrapper stdout/stderr to files for debugging while still emitting to the parent process. 'exec > >(tee -a "$WRAPPER_STDOUT") 2> >(tee -a "$WRAPPER_STDERR" >&2)', "", - # Launch Megatron-Bridge (log stdout/stderr to file) + *container_runtime_exports, "", ': >"$LOG"', "WANDB_INSTALL_RC=0", diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index b8e599388..b0395d0a6 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -225,6 +225,38 @@ def test_env_vars_are_forwarded_via_custom_bash_cmds( assert "-cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3'" in wrapper_content assert "-cb 'export NCCL_DEBUG=INFO'" in wrapper_content + def test_container_runtime_env_vars_exported_in_wrapper_script( + self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] + ) -> None: + configured_slurm_system.global_env_vars = { + "MELLANOX_VISIBLE_DEVICES": "0,1,4,5", + "NCCL_IB_HCA": "roce_p0_r0,roce_p0_r1,roce_p0_r2,roce_p0_r3", + "NCCL_IB_GID_INDEX": "3", + } + tr = make_test_run(output_subdir="out_container_rt") + tdef = cast(MegatronBridgeTestDefinition, tr.test) + tdef.extra_env_vars = {"NVIDIA_VISIBLE_DEVICES": "all", "NCCL_DEBUG": "INFO"} + + cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) + wrapper_content = self._wrapper_content(cmd_gen) + + launcher_idx = wrapper_content.index("setup_experiment.py") + + assert "export MELLANOX_VISIBLE_DEVICES=0,1,4,5" in wrapper_content + assert "export NVIDIA_VISIBLE_DEVICES=all" in wrapper_content + mvd_idx = wrapper_content.index("export MELLANOX_VISIBLE_DEVICES=") + nvd_idx = wrapper_content.index("export NVIDIA_VISIBLE_DEVICES=") + assert mvd_idx < launcher_idx, "MELLANOX_VISIBLE_DEVICES must be exported before the launcher" + assert nvd_idx < launcher_idx, "NVIDIA_VISIBLE_DEVICES must be exported before the launcher" + + assert "-cb 'export MELLANOX_VISIBLE_DEVICES=0,1,4,5'" in wrapper_content + assert "-cb 'export NVIDIA_VISIBLE_DEVICES=all'" in wrapper_content + assert "-cb 'export NCCL_IB_HCA=roce_p0_r0,roce_p0_r1,roce_p0_r2,roce_p0_r3'" in wrapper_content + assert "-cb 'export NCCL_DEBUG=INFO'" in wrapper_content + + assert "export NCCL_IB_HCA=" not in wrapper_content.split("setup_experiment.py")[0] + assert "export NCCL_DEBUG=" not in wrapper_content.split("setup_experiment.py")[0] + def test_wrapper_emits_job_id_even_when_launcher_non_zero( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] ) -> None: From f5681b83cae3dd635355e717940ff644e16be43a Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 19 Mar 2026 15:33:42 +0200 Subject: [PATCH 28/30] Add supports_gpu_directives check --- .../megatron_bridge/slurm_command_gen_strategy.py | 3 ++- .../test_command_gen_strategy_slurm.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index b590444a7..abb0a1330 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -455,8 +455,9 @@ def add_field(field: str, flag: str, value: Any) -> None: additional_slurm_params: list[str] = [] - if args.gpus_per_node: + if args.gpus_per_node and self.system.supports_gpu_directives: additional_slurm_params.append(f"gpus-per-node={args.gpus_per_node}") + additional_slurm_params.append(f"gres=gpu:{args.gpus_per_node}") _, node_list = self.get_cached_nodes_spec() if node_list: diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index b0395d0a6..6a977d95e 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -380,11 +380,23 @@ def test_no_mount_as_skips_repo_container_mount( def test_gpus_per_node_passed_as_additional_slurm_param( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] ) -> None: + configured_slurm_system.supports_gpu_directives_cache = True tr = make_test_run(cmd_args_overrides={"gpus_per_node": 2}, output_subdir="out_gpus") cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) assert "--additional_slurm_params" in wrapper_content assert "gpus-per-node=2" in wrapper_content + assert "gres=gpu:2" in wrapper_content + + def test_gpus_per_node_skipped_when_gpu_directives_unsupported( + self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] + ) -> None: + configured_slurm_system.supports_gpu_directives_cache = False + tr = make_test_run(cmd_args_overrides={"gpus_per_node": 2}, output_subdir="out_no_gpu_directives") + cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) + wrapper_content = self._wrapper_content(cmd_gen) + assert "gpus-per-node=2" not in wrapper_content + assert "gres=gpu:2" not in wrapper_content def test_system_extra_srun_args_forwarded( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] From 15081a693a83b89dff9f90c358578540fcae3252 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 19 Mar 2026 15:46:28 +0200 Subject: [PATCH 29/30] Fix tests --- tests/ref_data/megatron-bridge.sbatch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ref_data/megatron-bridge.sbatch b/tests/ref_data/megatron-bridge.sbatch index 4c95d637f..884b90091 100644 --- a/tests/ref_data/megatron-bridge.sbatch +++ b/tests/ref_data/megatron-bridge.sbatch @@ -19,7 +19,7 @@ if [ "${WANDB_INSTALL_RC}" -ne 0 ]; then fi LAUNCH_RC=0 -NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params gpus-per-node=4 >>"$LOG" 2>&1 || LAUNCH_RC=$? +NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params 'gpus-per-node=4;gres=gpu:4' >>"$LOG" 2>&1 || LAUNCH_RC=$? JOB_ID="" From 3a1a996893384ab5facb86fcf2a4b9ead8470ae6 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 19 Mar 2026 17:04:27 +0200 Subject: [PATCH 30/30] Restructure configurations --- .../test/b200/megatron_bridge_qwen_30b.toml | 9 ++---- .../test/gb200/megatron_bridge_qwen_30b.toml | 9 ++---- .../test/gb300/megatron_bridge_qwen_30b.toml | 10 ++---- .../test/h100/megatron_bridge_qwen_30b.toml | 9 ++---- .../megatron_bridge_r0.3.0_qwen_30b.toml | 31 +++++++++++++++++++ 5 files changed, 40 insertions(+), 28 deletions(-) create mode 100644 conf/experimental/megatron_bridge/test_scenario/megatron_bridge_r0.3.0_qwen_30b.toml diff --git a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml index 108f78993..89cd95dd0 100644 --- a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml @@ -22,13 +22,11 @@ extra_container_mounts = [] [[git_repos]] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" -commit = "r0.3.0" -mount_as = "/opt/Megatron-Bridge" -init_submodules = true +commit = "v0.3.0" [cmd_args] gpu_type = "b200" -container_image = "nvcr.io#nvidia/nemo:26.02" +container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" gpus_per_node = 8 @@ -38,6 +36,3 @@ task = "pretrain" compute_dtype = "fp8_mx" hf_token = "" enable_vboost = true - -[extra_env_vars] -PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml index 2225ac2ed..d0ca6e2fb 100644 --- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml @@ -22,13 +22,11 @@ extra_container_mounts = [] [[git_repos]] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" -commit = "r0.3.0" -mount_as = "/opt/Megatron-Bridge" -init_submodules = true +commit = "v0.3.0" [cmd_args] gpu_type = "gb200" -container_image = "nvcr.io#nvidia/nemo:26.02" +container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" gpus_per_node = 4 @@ -38,6 +36,3 @@ task = "pretrain" compute_dtype = "fp8_mx" hf_token = "" enable_vboost = true - -[extra_env_vars] -PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" diff --git a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml index 44b724e75..f68899a08 100644 --- a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml @@ -22,22 +22,18 @@ extra_container_mounts = [] [[git_repos]] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" -commit = "r0.3.0" -mount_as = "/opt/Megatron-Bridge" -init_submodules = true +commit = "v0.3.0" [cmd_args] gpu_type = "gb300" -container_image = "nvcr.io#nvidia/nemo:26.02" +container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" gpus_per_node = 4 num_gpus = 8 +# mb = 4 # In case OOM, uncomment this for smaller micro-batch size domain = "llm" task = "pretrain" compute_dtype = "fp8_mx" hf_token = "" enable_vboost = true - -[extra_env_vars] -PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" diff --git a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml index 200e256c6..84c52f893 100644 --- a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml @@ -22,13 +22,11 @@ extra_container_mounts = [] [[git_repos]] url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" -commit = "r0.3.0" -mount_as = "/opt/Megatron-Bridge" -init_submodules = true +commit = "v0.3.0" [cmd_args] gpu_type = "h100" -container_image = "nvcr.io#nvidia/nemo:26.02" +container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" gpus_per_node = 8 @@ -38,6 +36,3 @@ task = "pretrain" compute_dtype = "fp8_cs" hf_token = "" enable_vboost = true - -[extra_env_vars] -PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}" diff --git a/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_r0.3.0_qwen_30b.toml b/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_r0.3.0_qwen_30b.toml new file mode 100644 index 000000000..70e62e188 --- /dev/null +++ b/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_r0.3.0_qwen_30b.toml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "megatron_bridge_qwen_30b" + +[[Tests]] +id = "megatron_bridge_qwen_30b" +test_name = "megatron_bridge_qwen_30b" +num_nodes = "2" + + [[Tests.git_repos]] + url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git" + commit = "r0.3.0" + mount_as = "/opt/Megatron-Bridge" + init_submodules = true + + [Tests.extra_env_vars] + PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"