From a0d54a42e58d8b9fd05c2fb0287c3f9c7139a39f Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Fri, 6 Mar 2026 13:47:25 +0200
Subject: [PATCH 01/30] Make node placement correctly propagate M-Bridge

---
 .../workloads/megatron_bridge/slurm_command_gen_strategy.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index 8d08c5696..1fae8e229 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -400,6 +400,12 @@ def add_field(field: str, flag: str, value: Any) -> None:
         add_field("nsys_trace", "--nsys_trace", self._list_or_comma_str(args.nsys_trace))
         add_field("nsys_extra_args", "--nsys_extra_args", self._list_or_comma_str(args.nsys_extra_args))
 
+        # Node placement (pass resolved nodelist to Megatron-Bridge via --additional_slurm_params)
+        _, node_list = self.get_cached_nodes_spec()
+        if node_list:
+            nodelist_str = ",".join(node_list)
+            parts.extend(["--additional_slurm_params", shlex.quote(f"nodelist={nodelist_str}")])
+
         # Config variant
         add_field("config_variant", "-cv", args.config_variant)
         if args.list_config_variants and "list_config_variants" in fields_set:

From e9b1218d72d0f18ac1fb900f0c93a65c38147c0a Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Tue, 10 Mar 2026 14:01:43 +0200
Subject: [PATCH 02/30] Allow excluding nodes from group of nodes

---
 src/cloudai/_core/test_scenario.py            |  1 +
 src/cloudai/models/scenario.py                |  4 +++
 .../slurm/slurm_command_gen_strategy.py       | 10 ++++--
 src/cloudai/systems/slurm/slurm_system.py     | 36 ++++++++++++++-----
 src/cloudai/test_scenario_parser.py           |  1 +
 tests/systems/slurm/test_allocation.py        | 21 +++++++++++
 tests/systems/slurm/test_system.py            |  2 +-
 7 files changed, 64 insertions(+), 11 deletions(-)

diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py
index 5480adf78..22484ddc1 100644
--- a/src/cloudai/_core/test_scenario.py
+++ b/src/cloudai/_core/test_scenario.py
@@ -75,6 +75,7 @@ class TestRun:
     post_test: Optional[TestScenario] = None
     reports: Set[Type[ReportGenerationStrategy]] = field(default_factory=set)
     extra_srun_args: str | None = None
+    exclude_nodes: str | None = None
 
     def __hash__(self) -> int:
         return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration))
diff --git a/src/cloudai/models/scenario.py b/src/cloudai/models/scenario.py
index cb5452af2..823a83527 100644
--- a/src/cloudai/models/scenario.py
+++ b/src/cloudai/models/scenario.py
@@ -80,6 +80,10 @@ class TestRunModel(BaseModel):
     time_limit: Optional[str] = None
     dependencies: list[TestRunDependencyModel] = Field(default_factory=list)
     extra_srun_args: str | None = None
+    exclude_nodes: str | None = Field(
+        default=None,
+        description="Comma-separated hostnames to exclude from the resolved node list.",
+    )
 
     # test definition fields
     name: Optional[str] = None
diff --git a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
index 5c17b4d17..661eda35f 100644
--- a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
+++ b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
@@ -26,7 +26,7 @@
 from cloudai.core import CommandGenStrategy, Registry, System, TestRun, TestScenario
 from cloudai.models.scenario import TestRunDetails
 
-from .slurm_system import SlurmSystem
+from .slurm_system import SlurmSystem, parse_node_list
 
 
 class SlurmCommandGenStrategy(CommandGenStrategy):
@@ -465,6 +465,7 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]:
                 str(self.test_run.step),
                 str(self.test_run.nnodes),
                 ",".join(self.test_run.nodes),
+                self.test_run.exclude_nodes or "",
             ]
         )
 
@@ -472,5 +473,10 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]:
             logging.debug(f"Using cached node allocation for {cache_key}: {self._node_spec_cache[cache_key]}")
             return self._node_spec_cache[cache_key]
 
-        self._node_spec_cache[cache_key] = self.system.get_nodes_by_spec(self.test_run.nnodes, self.test_run.nodes)
+        excluded = set(parse_node_list(self.test_run.exclude_nodes)) if self.test_run.exclude_nodes else None
+        num_nodes, node_list = self.system.get_nodes_by_spec(
+            self.test_run.nnodes, self.test_run.nodes, exclude_nodes=excluded
+        )
+
+        self._node_spec_cache[cache_key] = (num_nodes, node_list)
         return self._node_spec_cache[cache_key]
diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py
index 5edbed01a..36395b3bb 100644
--- a/src/cloudai/systems/slurm/slurm_system.py
+++ b/src/cloudai/systems/slurm/slurm_system.py
@@ -443,7 +443,11 @@ def format_range(lst: List[int], padding: int) -> List[str]:
         return ", ".join(formatted_ranges)
 
     def get_available_nodes_from_group(
-        self, partition_name: str, group_name: str, number_of_nodes: Union[int, str]
+        self,
+        partition_name: str,
+        group_name: str,
+        number_of_nodes: Union[int, str],
+        exclude_nodes: set[str] | None = None,
     ) -> List[SlurmNode]:
         """
         Retrieve a specific number of potentially available nodes from a group within a partition.
@@ -456,6 +460,7 @@ def get_available_nodes_from_group(
             group_name (str): The name of the group.
             number_of_nodes (Union[int,str]): The number of nodes to retrieve.
                 Could also be 'all' to retrieve all the nodes from the group.
+            exclude_nodes (set[str] | None): Node names to exclude from the pool before selection.
 
         Returns:
             List[SlurmNode]: Objects that are potentially available for use.
@@ -468,7 +473,7 @@ def get_available_nodes_from_group(
 
         self.validate_partition_and_group(partition_name, group_name)
 
-        grouped_nodes = self.group_nodes_by_state(partition_name, group_name)
+        grouped_nodes = self.group_nodes_by_state(partition_name, group_name, exclude_nodes=exclude_nodes)
 
         try:
             allocated_nodes = self.allocate_nodes(grouped_nodes, number_of_nodes, group_name)
@@ -505,14 +510,19 @@ def validate_partition_and_group(self, partition_name: str, group_name: str) ->
         if group_name not in self.groups[partition_name]:
             raise ValueError(f"Group '{group_name}' not found in partition '{partition_name}'.")
 
-    def group_nodes_by_state(self, partition_name: str, group_name: str) -> Dict[SlurmNodeState, List[SlurmNode]]:
+    def group_nodes_by_state(
+        self,
+        partition_name: str,
+        group_name: str,
+        exclude_nodes: set[str] | None = None,
+    ) -> Dict[SlurmNodeState, List[SlurmNode]]:
         """
         Group nodes by their states, excluding nodes allocated to the current user.
 
         Args:
             partition_name (str): The name of the partition.
             group_name (str): The name of the group.
-            current_user (str): The username of the current user.
+            exclude_nodes (set[str] | None): Node names to exclude from the pool before grouping.
 
         Returns:
             Dict[SlurmNodeState, List[SlurmNode]]: A dictionary grouping nodes by their state.
@@ -524,6 +534,8 @@ def group_nodes_by_state(self, partition_name: str, group_name: str) -> Dict[Slu
         }
 
         for node in self.groups[partition_name][group_name]:
+            if exclude_nodes and node.name in exclude_nodes:
+                continue
             if node.state in grouped_nodes:
                 grouped_nodes[node.state].append(node)
 
@@ -671,7 +683,7 @@ def convert_state_to_enum(self, state_str: str) -> SlurmNodeState:
                 logging.debug(f"Unknown node state: {core_state}")
                 return SlurmNodeState.UNKNOWN_STATE
 
-    def parse_nodes(self, nodes: List[str]) -> List[str]:
+    def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) -> List[str]:
         """
         Parse a list of node specifications into individual node names.
 
@@ -684,6 +696,7 @@ def parse_nodes(self, nodes: List[str]) -> List[str]:
                 "partition:group:num_nodes", where "partition" is the partition name, "group" is a group within that
                 partition, and "num_nodes" is the number of nodes requested. Node ranges should be specified with
                 square brackets and dashes, e.g., "node[01-03]" for "node01", "node02", "node03".
+            exclude_nodes (set[str] | None): Node names to exclude from group pools before selection.
 
         Returns:
             List[str]: A list of node names. For specifications, it includes names of allocated nodes based on the
@@ -701,17 +714,23 @@ def parse_nodes(self, nodes: List[str]) -> List[str]:
                     raise ValueError("Format should be partition:group:num_nodes")
                 partition_name, group_name, num_nodes_spec = parts
                 num_nodes = int(num_nodes_spec) if num_nodes_spec != "max_avail" else num_nodes_spec
-                group_nodes = self.get_available_nodes_from_group(partition_name, group_name, num_nodes)
+                group_nodes = self.get_available_nodes_from_group(
+                    partition_name, group_name, num_nodes, exclude_nodes=exclude_nodes
+                )
                 parsed_nodes += [node.name for node in group_nodes]
             else:
                 expanded_nodes = parse_node_list(node_spec)
+                if exclude_nodes:
+                    expanded_nodes = [n for n in expanded_nodes if n not in exclude_nodes]
                 parsed_nodes += expanded_nodes
 
         # Remove duplicates while preserving order
         parsed_nodes = list(dict.fromkeys(parsed_nodes))
         return parsed_nodes
 
-    def get_nodes_by_spec(self, num_nodes: int, nodes: list[str]) -> Tuple[int, list[str]]:
+    def get_nodes_by_spec(
+        self, num_nodes: int, nodes: list[str], exclude_nodes: set[str] | None = None
+    ) -> Tuple[int, list[str]]:
         """
         Retrieve a list of node names based on specifications.
 
@@ -721,12 +740,13 @@ def get_nodes_by_spec(self, num_nodes: int, nodes: list[str]) -> Tuple[int, list
         Args:
             num_nodes (int): The number of nodes, can't be `0`.
             nodes (list[str]): A list of node names specifications, slurm format or `PARTITION:GROUP:NUM_NODES`.
+            exclude_nodes (set[str] | None): Node names to exclude from group pools before selection.
 
         Returns:
             Tuple[int, list[str]]: The number of nodes and a list of node names.
         """
         num_nodes, node_list = num_nodes, []
-        parsed_nodes = self.parse_nodes(nodes)
+        parsed_nodes = self.parse_nodes(nodes, exclude_nodes=exclude_nodes)
         if parsed_nodes:
             num_nodes = len(parsed_nodes)
             node_list = parsed_nodes
diff --git a/src/cloudai/test_scenario_parser.py b/src/cloudai/test_scenario_parser.py
index 774e8dd9c..a75c467af 100644
--- a/src/cloudai/test_scenario_parser.py
+++ b/src/cloudai/test_scenario_parser.py
@@ -208,6 +208,7 @@ def _create_test_run(
             post_test=post_test,
             reports=get_reporters(test_info, tdef),
             extra_srun_args=test_info.extra_srun_args,
+            exclude_nodes=test_info.exclude_nodes,
         )
 
         return tr
diff --git a/tests/systems/slurm/test_allocation.py b/tests/systems/slurm/test_allocation.py
index 9426f5fa5..5913fa1a7 100644
--- a/tests/systems/slurm/test_allocation.py
+++ b/tests/systems/slurm/test_allocation.py
@@ -115,3 +115,24 @@ def test_group_allocation_is_preserved_on_updated(self, slurm_system: SlurmSyste
         ):
             system.update()
         assert all(node.state == SlurmNodeState.ALLOCATED for node in system.group_allocated)
+
+    def test_exclude_nodes_selects_from_remaining_pool(
+        self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch
+    ):
+        """Excluding a node from a group should still yield the requested count from the remaining pool."""
+        system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
+        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:4"], exclude_nodes={"node03"})
+        assert nnodes == 4
+        assert "node03" not in nodes_list
+        assert len(nodes_list) == 4
+
+    def test_exclude_multiple_nodes_from_group(
+        self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch
+    ):
+        """Excluding multiple nodes still selects the requested count from remaining nodes."""
+        system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
+        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"], exclude_nodes={"node01", "node05"})
+        assert nnodes == 3
+        assert "node01" not in nodes_list
+        assert "node05" not in nodes_list
+        assert len(nodes_list) == 3
diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py
index 0ad79b61f..6ba1da1d1 100644
--- a/tests/systems/slurm/test_system.py
+++ b/tests/systems/slurm/test_system.py
@@ -363,7 +363,7 @@ def test_explicit_node_names(
 
         num_nodes, node_list = slurm_system.get_nodes_by_spec(in_nnodes, in_nodes)
 
-        mock_parse_nodes.assert_called_once_with(in_nodes)
+        mock_parse_nodes.assert_called_once_with(in_nodes, exclude_nodes=None)
         assert num_nodes == exp_nnodes
         assert node_list == exp_nodes
 

From d720616f683c92cdce044a22e50db42e955e06de Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 07:36:48 +0200
Subject: [PATCH 03/30] Also init submodules when install git repos

---
 .../kubernetes/kubernetes_installer.py        | 15 +++++++
 src/cloudai/systems/slurm/slurm_installer.py  | 15 +++++++
 tests/test_git_repo_installer.py              | 39 +++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/src/cloudai/systems/kubernetes/kubernetes_installer.py b/src/cloudai/systems/kubernetes/kubernetes_installer.py
index 6f18f2a7a..f360230ea 100644
--- a/src/cloudai/systems/kubernetes/kubernetes_installer.py
+++ b/src/cloudai/systems/kubernetes/kubernetes_installer.py
@@ -172,6 +172,13 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
                 rmtree(repo_path)
             return res
 
+        res = self._init_submodules(repo_path)
+        if not res.success:
+            logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
+            if repo_path.exists():
+                rmtree(repo_path)
+            return res
+
         item.installed_path = repo_path
         return InstallStatusResult(True)
 
@@ -237,6 +244,14 @@ def _checkout_commit(self, commit_hash: str, path: Path) -> InstallStatusResult:
             return InstallStatusResult(False, f"Failed to checkout commit {commit_hash}: {result.stderr}")
         return InstallStatusResult(True)
 
+    def _init_submodules(self, path: Path) -> InstallStatusResult:
+        logging.debug(f"Initializing submodules in {path}")
+        submodule_cmd = ["git", "submodule", "update", "--init", "--recursive"]
+        result = subprocess.run(submodule_cmd, cwd=str(path), capture_output=True, text=True)
+        if result.returncode != 0:
+            return InstallStatusResult(False, f"Failed to initialize submodules: {result.stderr}")
+        return InstallStatusResult(True)
+
     def _verify_commit(self, ref: str, path: Path) -> InstallStatusResult:
         try:
             result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(path), capture_output=True, text=True)
diff --git a/src/cloudai/systems/slurm/slurm_installer.py b/src/cloudai/systems/slurm/slurm_installer.py
index 4800becc5..d286c32d7 100644
--- a/src/cloudai/systems/slurm/slurm_installer.py
+++ b/src/cloudai/systems/slurm/slurm_installer.py
@@ -225,6 +225,13 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
                 rmtree(repo_path)
             return res
 
+        res = self._init_submodules(repo_path)
+        if not res.success:
+            logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
+            if repo_path.exists():
+                rmtree(repo_path)
+            return res
+
         item.installed_path = repo_path
         return InstallStatusResult(True)
 
@@ -290,6 +297,14 @@ def _checkout_commit(self, commit_hash: str, path: Path) -> InstallStatusResult:
             return InstallStatusResult(False, f"Failed to checkout commit {commit_hash}: {result.stderr}")
         return InstallStatusResult(True)
 
+    def _init_submodules(self, path: Path) -> InstallStatusResult:
+        logging.debug(f"Initializing submodules in {path}")
+        submodule_cmd = ["git", "submodule", "update", "--init", "--recursive"]
+        result = subprocess.run(submodule_cmd, cwd=str(path), capture_output=True, text=True)
+        if result.returncode != 0:
+            return InstallStatusResult(False, f"Failed to initialize submodules: {result.stderr}")
+        return InstallStatusResult(True)
+
     def _verify_commit(self, ref: str, path: Path) -> InstallStatusResult:
         try:
             result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(path), capture_output=True, text=True)
diff --git a/tests/test_git_repo_installer.py b/tests/test_git_repo_installer.py
index 545f1fe1d..3d12298c1 100644
--- a/tests/test_git_repo_installer.py
+++ b/tests/test_git_repo_installer.py
@@ -232,9 +232,48 @@ def test_verify_commit_branch_name_match(self, installer: Union[KubernetesInstal
             res = installer._verify_commit(ref, repo_path)
         assert res.success
 
+    def test_submodules_initialized(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo):
+        repo_path = installer.system.install_path / git.repo_name
+        repo_path.mkdir()
+        with patch("subprocess.run") as mock_run:
+            mock_run.return_value = CompletedProcess(args=[], returncode=0)
+            res = installer._init_submodules(repo_path)
+        assert res.success
+        mock_run.assert_called_once_with(
+            ["git", "submodule", "update", "--init", "--recursive"],
+            cwd=str(repo_path),
+            capture_output=True,
+            text=True,
+        )
+
+    def test_error_initializing_submodules(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo):
+        repo_path = installer.system.install_path / git.repo_name
+        repo_path.mkdir()
+        with patch("subprocess.run") as mock_run:
+            mock_run.return_value = CompletedProcess(args=[], returncode=1, stderr="err")
+            res = installer._init_submodules(repo_path)
+        assert not res.success
+        assert res.message == "Failed to initialize submodules: err"
+
+    def test_submodule_failure_cleans_up_repo(
+        self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo
+    ):
+        repo_path = installer.system.install_path / git.repo_name
+        installer._clone_repository = Mock(
+            side_effect=lambda url, path: (path.mkdir(parents=True, exist_ok=True), InstallStatusResult(True))[1]
+        )
+        installer._checkout_commit = Mock(return_value=InstallStatusResult(True))
+        installer._init_submodules = Mock(
+            return_value=InstallStatusResult(False, "Failed to initialize submodules: err")
+        )
+        res = installer._install_one_git_repo(git)
+        assert not res.success
+        assert not repo_path.exists()
+
     def test_all_good_flow(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo):
         installer._clone_repository = Mock(return_value=InstallStatusResult(True))
         installer._checkout_commit = Mock(return_value=InstallStatusResult(True))
+        installer._init_submodules = Mock(return_value=InstallStatusResult(True))
         res = installer._install_one_git_repo(git)
         assert res.success
         assert git.installed_path == installer.system.install_path / git.repo_name

From ea4844db42bb7f35a90ec1d10c30069140fa4fc7 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 07:57:41 +0200
Subject: [PATCH 04/30] Add options to mount the repo in the container

---
 .../megatron_bridge/megatron_bridge.py        | 10 ++++-
 .../slurm_command_gen_strategy.py             | 17 +++++----
 .../test_command_gen_strategy_slurm.py        | 37 +++++++++++++++----
 3 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py
index 783072c59..7ff8f72b3 100644
--- a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py
+++ b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py
@@ -184,10 +184,16 @@ class MegatronBridgeTestDefinition(TestDefinition):
 
     @staticmethod
     def _select_megatron_bridge_repo(git_repos: list[GitRepo]) -> GitRepo | None:
-        """Return the Megatron-Bridge repo from `git_repos` (normalized to mount_as=/opt/Megatron-Bridge)."""
+        """
+        Return the Megatron-Bridge repo from `git_repos`.
+
+        When the user sets ``mount_as`` (e.g. ``/opt/Megatron-Bridge``), the installed clone will be bind-mounted
+        into the container at that path, overriding whatever the container image ships. When ``mount_as`` is *not*
+        set the container's built-in ``/opt/Megatron-Bridge`` is used.
+        """
         for repo in git_repos:
             if "Megatron-Bridge" in repo.url or (repo.mount_as or "").rstrip("/") == "/opt/Megatron-Bridge":
-                return repo if repo.mount_as else repo.model_copy(update={"mount_as": "/opt/Megatron-Bridge"})
+                return repo
         return None
 
     @field_validator("git_repos", mode="after")
diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index 1fae8e229..064b7e199 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -225,14 +225,15 @@ def _installed_container_path() -> str:
             else:
                 container_path = _installed_container_path()
 
-        # Use only test-level extra_container_mounts; never mount the Megatron-Bridge repo via -cm
-        # because the container uses its built-in copy.
-        mounts = [str(m).strip() for m in (tdef.extra_container_mounts or []) if str(m).strip()]
-        mounts = [
-            m
-            for m in mounts
-            if "/opt/Megatron-Bridge" not in m and "Megatron-Bridge" not in m.split(":")[0].split("/")[-1]
-        ]
+        mounts: list[str] = []
+        mounts.extend(tdef.extra_container_mounts or [])
+
+        # When the user sets mount_as on the Megatron-Bridge git repo, bind-mount the
+        # installed clone into the container to override the image's built-in copy.
+        mb_repo = tdef.megatron_bridge_repo
+        if mb_repo.mount_as:
+            mb_host = mb_repo.installed_path.absolute() if mb_repo.installed_path else repo_path
+            mounts.append(f"{mb_host}:{mb_repo.mount_as}")
 
         venv_path = tdef.python_executable.venv_path or (self.system.install_path / tdef.python_executable.venv_name)
         python_bin = (venv_path / "bin" / "python").absolute()
diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
index 23708e50c..120ad7de9 100644
--- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
@@ -64,6 +64,7 @@ def _make(
             *,
             cmd_args_overrides: dict[str, Any] | None = None,
             git_commit: str = "r0.2.0",
+            mount_as: str | None = "/opt/Megatron-Bridge",
             output_subdir: str = "out",
             num_nodes: int = 2,
         ) -> TestRun:
@@ -78,19 +79,20 @@ def _make(
             if cmd_args_overrides:
                 cmd_args_data.update(cmd_args_overrides)
 
+            repo_kwargs: dict[str, Any] = {
+                "url": "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
+                "commit": git_commit,
+            }
+            if mount_as is not None:
+                repo_kwargs["mount_as"] = mount_as
+
             tdef = MegatronBridgeTestDefinition(
                 name="mb",
                 description="desc",
                 test_template_name="MegatronBridge",
                 cmd_args=MegatronBridgeCmdArgs.model_validate(cmd_args_data),
                 extra_container_mounts=[],
-                git_repos=[
-                    GitRepo(
-                        url="https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
-                        commit=git_commit,
-                        mount_as="/opt/Megatron-Bridge",
-                    )
-                ],
+                git_repos=[GitRepo(**repo_kwargs)],
             )
             self._configure_fake_installs(tdef, tmp_path)
             return TestRun(
@@ -317,3 +319,24 @@ def test_use_recipes_emitted_only_when_true(
         cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
         wrapper_content = self._wrapper_content(cmd_gen)
         assert ("--use_recipes" in wrapper_content) is expected_in_wrapper
+
+    def test_mount_as_adds_repo_to_container_mounts(
+        self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun], tmp_path: Path
+    ) -> None:
+        tr = make_test_run(mount_as="/opt/Megatron-Bridge", output_subdir="out_mount")
+        tdef = cast(MegatronBridgeTestDefinition, tr.test)
+        repo_path = tdef.megatron_bridge_repo.installed_path
+        assert repo_path is not None
+
+        cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
+        wrapper_content = self._wrapper_content(cmd_gen)
+        assert f"-cm {repo_path.absolute()}:/opt/Megatron-Bridge" in wrapper_content
+
+    def test_no_mount_as_skips_repo_container_mount(
+        self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
+    ) -> None:
+        tr = make_test_run(mount_as=None, output_subdir="out_no_mount")
+        cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
+        wrapper_content = self._wrapper_content(cmd_gen)
+        assert "-cm" not in wrapper_content
+        assert "/opt/Megatron-Bridge" not in wrapper_content

From 9f786ffed2aa7321eb9b696b3f7532c73315c133 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 09:56:26 +0200
Subject: [PATCH 05/30] Fix slurm gpu resource requesting

---
 .../megatron_bridge/slurm_command_gen_strategy.py     | 11 +++++++++--
 .../test_command_gen_strategy_slurm.py                |  9 +++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index 064b7e199..338eb24fc 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -401,11 +401,18 @@ def add_field(field: str, flag: str, value: Any) -> None:
         add_field("nsys_trace", "--nsys_trace", self._list_or_comma_str(args.nsys_trace))
         add_field("nsys_extra_args", "--nsys_extra_args", self._list_or_comma_str(args.nsys_extra_args))
 
-        # Node placement (pass resolved nodelist to Megatron-Bridge via --additional_slurm_params)
+        additional_slurm_params: list[str] = []
+
+        if args.gpus_per_node:
+            additional_slurm_params.append(f"gpus-per-node={args.gpus_per_node}")
+
         _, node_list = self.get_cached_nodes_spec()
         if node_list:
             nodelist_str = ",".join(node_list)
-            parts.extend(["--additional_slurm_params", shlex.quote(f"nodelist={nodelist_str}")])
+            additional_slurm_params.append(f"nodelist={nodelist_str}")
+
+        if additional_slurm_params:
+            parts.extend(["--additional_slurm_params", shlex.quote(" ".join(additional_slurm_params))])
 
         # Config variant
         add_field("config_variant", "-cv", args.config_variant)
diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
index 120ad7de9..430f36d38 100644
--- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
@@ -340,3 +340,12 @@ def test_no_mount_as_skips_repo_container_mount(
         wrapper_content = self._wrapper_content(cmd_gen)
         assert "-cm" not in wrapper_content
         assert "/opt/Megatron-Bridge" not in wrapper_content
+
+    def test_gpus_per_node_passed_as_additional_slurm_param(
+        self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
+    ) -> None:
+        tr = make_test_run(cmd_args_overrides={"gpus_per_node": 4}, output_subdir="out_gpus")
+        cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
+        wrapper_content = self._wrapper_content(cmd_gen)
+        assert "--additional_slurm_params" in wrapper_content
+        assert "gpus-per-node=4" in wrapper_content

From e6c9cb22a0f5b605e325b2f70f8191b268b262a8 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 08:37:31 +0200
Subject: [PATCH 06/30] Update gb200 M-Bridge r0.3.0 qwen recipe

---
 .../test/gb200/megatron_bridge_qwen_30b.toml              | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
index b06b9d1f6..d7903c026 100644
--- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
@@ -22,12 +22,12 @@ extra_container_mounts = []
 
 [[git_repos]]
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
-commit = "v0.3.0"
+commit = "r0.3.0"
 mount_as = "/opt/Megatron-Bridge"
 
 [cmd_args]
 gpu_type = "gb200"
-container_image = "nvcr.io#nvidia/nemo:26.02.00"
+container_image = "nvcr.io#nvidia/nemo:26.02"
 model_family_name = "qwen"
 model_recipe_name = "qwen3_30b_a3b"
 gpus_per_node = 4
@@ -37,3 +37,7 @@ task = "pretrain"
 compute_dtype = "fp8_mx"
 hf_token = ""
 enable_vboost = true
+detach = false
+
+[extra_env_vars]
+PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"

From 2acfb29771cbdf703cf7c967912a586f3e56d868 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 10:26:26 +0200
Subject: [PATCH 07/30] ruff check & ruff format

---
 tests/systems/slurm/test_allocation.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/systems/slurm/test_allocation.py b/tests/systems/slurm/test_allocation.py
index 5913fa1a7..cc690a920 100644
--- a/tests/systems/slurm/test_allocation.py
+++ b/tests/systems/slurm/test_allocation.py
@@ -120,17 +120,15 @@ def test_exclude_nodes_selects_from_remaining_pool(
         self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch
     ):
         """Excluding a node from a group should still yield the requested count from the remaining pool."""
-        system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
+        system, _all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
         nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:4"], exclude_nodes={"node03"})
         assert nnodes == 4
         assert "node03" not in nodes_list
         assert len(nodes_list) == 4
 
-    def test_exclude_multiple_nodes_from_group(
-        self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch
-    ):
+    def test_exclude_multiple_nodes_from_group(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
         """Excluding multiple nodes still selects the requested count from remaining nodes."""
-        system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
+        system, _all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
         nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"], exclude_nodes={"node01", "node05"})
         assert nnodes == 3
         assert "node01" not in nodes_list

From 45e6153a77b4fd2a65fa6395204e90c3c9a20d26 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 10:33:33 +0200
Subject: [PATCH 08/30] Fix tests to reflect updates

---
 tests/ref_data/megatron-bridge.sbatch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ref_data/megatron-bridge.sbatch b/tests/ref_data/megatron-bridge.sbatch
index a65cc44ed..4c95d637f 100644
--- a/tests/ref_data/megatron-bridge.sbatch
+++ b/tests/ref_data/megatron-bridge.sbatch
@@ -19,7 +19,7 @@ if [ "${WANDB_INSTALL_RC}" -ne 0 ]; then
 fi
 
 LAUNCH_RC=0
-NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false >>"$LOG" 2>&1 || LAUNCH_RC=$?
+NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params gpus-per-node=4 >>"$LOG" 2>&1 || LAUNCH_RC=$?
 
 
 JOB_ID=""

From e14fa6f0e862183f789154b61aa5955cd1637228 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 10:39:13 +0200
Subject: [PATCH 09/30] Update copyright year

---
 src/cloudai/_core/test_scenario.py        | 2 +-
 src/cloudai/systems/slurm/slurm_system.py | 2 +-
 src/cloudai/test_scenario_parser.py       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py
index 22484ddc1..a9329621b 100644
--- a/src/cloudai/_core/test_scenario.py
+++ b/src/cloudai/_core/test_scenario.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py
index 36395b3bb..3eb74563c 100644
--- a/src/cloudai/systems/slurm/slurm_system.py
+++ b/src/cloudai/systems/slurm/slurm_system.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/cloudai/test_scenario_parser.py b/src/cloudai/test_scenario_parser.py
index a75c467af..b324d638c 100644
--- a/src/cloudai/test_scenario_parser.py
+++ b/src/cloudai/test_scenario_parser.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

From 54f4fbbc09b76740a89136473e92ec00cdda08ec Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 11:28:52 +0200
Subject: [PATCH 10/30] Add back the defensive filter

---
 .../workloads/megatron_bridge/slurm_command_gen_strategy.py    | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index 338eb24fc..7ae8034a8 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -225,8 +225,7 @@ def _installed_container_path() -> str:
             else:
                 container_path = _installed_container_path()
 
-        mounts: list[str] = []
-        mounts.extend(tdef.extra_container_mounts or [])
+        mounts: list[str] = [str(m).strip() for m in (tdef.extra_container_mounts or []) if str(m).strip()]
 
         # When the user sets mount_as on the Megatron-Bridge git repo, bind-mount the
         # installed clone into the container to override the image's built-in copy.

From ff2f2fdc7cc8dfeff5802d5bff331b10dd81cd51 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 12:07:48 +0200
Subject: [PATCH 11/30] Enhance nodes allocation for edge cases

---
 .../slurm/slurm_command_gen_strategy.py       |  3 ++
 src/cloudai/systems/slurm/slurm_system.py     |  8 +++++
 .../slurm/test_command_gen_strategy.py        | 33 +++++++++++++++++++
 tests/systems/slurm/test_system.py            | 22 +++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
index 661eda35f..24725592f 100644
--- a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
+++ b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
@@ -424,6 +424,9 @@ def _append_nodes_related_directives(self, content: List[str]) -> Optional[Path]
 
         content.append(f"#SBATCH -N {num_nodes}")
 
+        if self.test_run.exclude_nodes:
+            content.append(f"#SBATCH --exclude={self.test_run.exclude_nodes}")
+
         return None
 
     def _format_env_vars(self, env_vars: Dict[str, Any]) -> str:
diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py
index 3eb74563c..7771139fe 100644
--- a/src/cloudai/systems/slurm/slurm_system.py
+++ b/src/cloudai/systems/slurm/slurm_system.py
@@ -744,12 +744,20 @@ def get_nodes_by_spec(
 
         Returns:
             Tuple[int, list[str]]: The number of nodes and a list of node names.
+
+        Raises:
+            ValueError: If node specifications were provided but resolved to an empty list after applying exclusions.
         """
         num_nodes, node_list = num_nodes, []
         parsed_nodes = self.parse_nodes(nodes, exclude_nodes=exclude_nodes)
         if parsed_nodes:
             num_nodes = len(parsed_nodes)
             node_list = parsed_nodes
+        elif nodes:
+            raise ValueError(
+                f"Node specifications {nodes} resolved to an empty node list after applying "
+                f"exclude_nodes={exclude_nodes}. Cannot fall back to unconstrained allocation."
+            )
         return num_nodes, sorted(node_list)
 
     def system_installables(self) -> list[Installable]:
diff --git a/tests/systems/slurm/test_command_gen_strategy.py b/tests/systems/slurm/test_command_gen_strategy.py
index e18fdf5ed..eeecc0298 100644
--- a/tests/systems/slurm/test_command_gen_strategy.py
+++ b/tests/systems/slurm/test_command_gen_strategy.py
@@ -361,6 +361,39 @@ def test_distribution_fallback_when_no_nodes(strategy_fixture: SlurmCommandGenSt
     assert "#SBATCH --nodelist=" not in content
 
 
+def test_exclude_nodes_directive_when_no_nodelist(strategy_fixture: SlurmCommandGenStrategy) -> None:
+    strategy_fixture.test_run.nodes = []
+    strategy_fixture.test_run.num_nodes = 3
+    strategy_fixture.test_run.exclude_nodes = "node01,node02"
+    content: List[str] = []
+    strategy_fixture._append_nodes_related_directives(content)
+
+    assert "#SBATCH -N 3" in content
+    assert "#SBATCH --exclude=node01,node02" in content
+
+
+def test_no_exclude_directive_when_nodelist_present(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> None:
+    testrun_fixture.nodes = ["node3", "node4"]
+    testrun_fixture.exclude_nodes = "node01,node02"
+    strategy = MySlurmCommandGenStrategy(slurm_system, testrun_fixture)
+    content: List[str] = []
+    strategy._append_nodes_related_directives(content)
+
+    assert "#SBATCH --nodelist=node3,node4" in content
+    assert "#SBATCH --exclude=" not in content
+
+
+def test_no_exclude_directive_when_exclude_nodes_unset(strategy_fixture: SlurmCommandGenStrategy) -> None:
+    strategy_fixture.test_run.nodes = []
+    strategy_fixture.test_run.num_nodes = 2
+    strategy_fixture.test_run.exclude_nodes = None
+    content: List[str] = []
+    strategy_fixture._append_nodes_related_directives(content)
+
+    assert "#SBATCH -N 2" in content
+    assert not any("--exclude" in line for line in content)
+
+
 def test_nodelist_over_num_nodes(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> None:
     testrun_fixture.nodes = ["nodeA", "nodeB", "nodeC"]
     testrun_fixture.num_nodes = 5
diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py
index 6ba1da1d1..e7a2ddd89 100644
--- a/tests/systems/slurm/test_system.py
+++ b/tests/systems/slurm/test_system.py
@@ -367,6 +367,28 @@ def test_explicit_node_names(
         assert num_nodes == exp_nnodes
         assert node_list == exp_nodes
 
+    @patch("cloudai.systems.slurm.slurm_system.SlurmSystem.parse_nodes")
+    def test_raises_when_all_nodes_excluded(self, mock_parse_nodes: Mock, slurm_system: SlurmSystem):
+        mock_parse_nodes.return_value = []
+        exclude = {"node01", "node02"}
+
+        with pytest.raises(ValueError, match="resolved to an empty node list"):
+            slurm_system.get_nodes_by_spec(2, ["node0[1-2]"], exclude_nodes=exclude)
+
+    @patch("cloudai.systems.slurm.slurm_system.SlurmSystem.parse_nodes")
+    def test_raises_when_parse_nodes_returns_empty_for_nonempty_specs(
+        self, mock_parse_nodes: Mock, slurm_system: SlurmSystem
+    ):
+        mock_parse_nodes.return_value = []
+
+        with pytest.raises(ValueError, match="resolved to an empty node list"):
+            slurm_system.get_nodes_by_spec(1, ["main:group1:3"])
+
+    def test_empty_nodes_with_exclude_still_returns_unconstrained(self, slurm_system: SlurmSystem):
+        num_nodes, node_list = slurm_system.get_nodes_by_spec(3, [], exclude_nodes={"node01"})
+        assert num_nodes == 3
+        assert node_list == []
+
 
 class ConcreteSlurmStrategy(SlurmCommandGenStrategy):
     def _container_mounts(self) -> list[str]:

From f26fe7bceecf7e880292b145cca929c93b471551 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 12:14:55 +0200
Subject: [PATCH 12/30] Enhance tests by avoiding default names

---
 .../megatron_bridge/test_command_gen_strategy_slurm.py    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
index 430f36d38..7dfbfa7ec 100644
--- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
@@ -323,14 +323,14 @@ def test_use_recipes_emitted_only_when_true(
     def test_mount_as_adds_repo_to_container_mounts(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun], tmp_path: Path
     ) -> None:
-        tr = make_test_run(mount_as="/opt/Megatron-Bridge", output_subdir="out_mount")
+        tr = make_test_run(mount_as="/opt/custom-megatron", output_subdir="out_mount")
         tdef = cast(MegatronBridgeTestDefinition, tr.test)
         repo_path = tdef.megatron_bridge_repo.installed_path
         assert repo_path is not None
 
         cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
         wrapper_content = self._wrapper_content(cmd_gen)
-        assert f"-cm {repo_path.absolute()}:/opt/Megatron-Bridge" in wrapper_content
+        assert f"-cm {repo_path.absolute()}:/opt/custom-megatron" in wrapper_content
 
     def test_no_mount_as_skips_repo_container_mount(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
@@ -344,8 +344,8 @@ def test_no_mount_as_skips_repo_container_mount(
     def test_gpus_per_node_passed_as_additional_slurm_param(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
     ) -> None:
-        tr = make_test_run(cmd_args_overrides={"gpus_per_node": 4}, output_subdir="out_gpus")
+        tr = make_test_run(cmd_args_overrides={"gpus_per_node": 2}, output_subdir="out_gpus")
         cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
         wrapper_content = self._wrapper_content(cmd_gen)
         assert "--additional_slurm_params" in wrapper_content
-        assert "gpus-per-node=4" in wrapper_content
+        assert "gpus-per-node=2" in wrapper_content

From a159736ae1fca5fd7139b6bf3c726e5d6ff3beac Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Wed, 11 Mar 2026 12:34:10 +0200
Subject: [PATCH 13/30] Make exclude nodes correctly propagate to M-Bridge

---
 .../workloads/megatron_bridge/slurm_command_gen_strategy.py     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index 7ae8034a8..a05fe1079 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -409,6 +409,8 @@ def add_field(field: str, flag: str, value: Any) -> None:
         if node_list:
             nodelist_str = ",".join(node_list)
             additional_slurm_params.append(f"nodelist={nodelist_str}")
+        elif self.test_run.exclude_nodes:
+            additional_slurm_params.append(f"exclude={self.test_run.exclude_nodes}")
 
         if additional_slurm_params:
             parts.extend(["--additional_slurm_params", shlex.quote(" ".join(additional_slurm_params))])

From 5da0a578eb5d0a86007d08bb88290ecb1df4c4cb Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 08:23:46 +0200
Subject: [PATCH 14/30] Improve error messages for excluding nodes

---
 src/cloudai/systems/slurm/slurm_system.py | 11 ++++++++---
 tests/systems/slurm/test_system.py        |  4 ++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py
index 7771139fe..46bb9b1bf 100644
--- a/src/cloudai/systems/slurm/slurm_system.py
+++ b/src/cloudai/systems/slurm/slurm_system.py
@@ -746,7 +746,7 @@ def get_nodes_by_spec(
             Tuple[int, list[str]]: The number of nodes and a list of node names.
 
         Raises:
-            ValueError: If node specifications were provided but resolved to an empty list after applying exclusions.
+            ValueError: If node specifications were provided but resolved to an empty list.
         """
         num_nodes, node_list = num_nodes, []
         parsed_nodes = self.parse_nodes(nodes, exclude_nodes=exclude_nodes)
@@ -754,9 +754,14 @@ def get_nodes_by_spec(
             num_nodes = len(parsed_nodes)
             node_list = parsed_nodes
         elif nodes:
+            reason = (
+                f"after excluding nodes {exclude_nodes}"
+                if exclude_nodes
+                else "— no nodes are available (all may be DRAIN/DOWN)"
+            )
             raise ValueError(
-                f"Node specifications {nodes} resolved to an empty node list after applying "
-                f"exclude_nodes={exclude_nodes}. Cannot fall back to unconstrained allocation."
+                f"Node specifications {nodes} resolved to an empty node list {reason}. "
+                "Cannot fall back to unconstrained allocation."
             )
         return num_nodes, sorted(node_list)
 
diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py
index e7a2ddd89..3a51f3360 100644
--- a/tests/systems/slurm/test_system.py
+++ b/tests/systems/slurm/test_system.py
@@ -372,7 +372,7 @@ def test_raises_when_all_nodes_excluded(self, mock_parse_nodes: Mock, slurm_syst
         mock_parse_nodes.return_value = []
         exclude = {"node01", "node02"}
 
-        with pytest.raises(ValueError, match="resolved to an empty node list"):
+        with pytest.raises(ValueError, match="after excluding nodes"):
             slurm_system.get_nodes_by_spec(2, ["node0[1-2]"], exclude_nodes=exclude)
 
     @patch("cloudai.systems.slurm.slurm_system.SlurmSystem.parse_nodes")
@@ -381,7 +381,7 @@ def test_raises_when_parse_nodes_returns_empty_for_nonempty_specs(
     ):
         mock_parse_nodes.return_value = []
 
-        with pytest.raises(ValueError, match="resolved to an empty node list"):
+        with pytest.raises(ValueError, match="no nodes are available"):
             slurm_system.get_nodes_by_spec(1, ["main:group1:3"])
 
     def test_empty_nodes_with_exclude_still_returns_unconstrained(self, slurm_system: SlurmSystem):

From 2d7b98b2d845239c312fc283f167219bba717bbb Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 08:36:28 +0200
Subject: [PATCH 15/30] Improve exclude nodes APIs

---
 src/cloudai/_core/test_scenario.py            |  2 +-
 src/cloudai/models/scenario.py                | 11 ++++++----
 .../slurm/slurm_command_gen_strategy.py       |  9 ++++-----
 src/cloudai/systems/slurm/slurm_system.py     | 20 +++++++++++--------
 .../slurm_command_gen_strategy.py             |  2 +-
 tests/systems/slurm/test_allocation.py        |  4 ++--
 .../slurm/test_command_gen_strategy.py        |  6 +++---
 tests/systems/slurm/test_system.py            |  4 ++--
 8 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py
index a9329621b..5eb28820c 100644
--- a/src/cloudai/_core/test_scenario.py
+++ b/src/cloudai/_core/test_scenario.py
@@ -62,6 +62,7 @@ class TestRun:
     test: TestDefinition
     num_nodes: Union[int, list[int]]
     nodes: List[str]
+    exclude_nodes: List[str] = field(default_factory=list)
     output_path: Path = Path("")
     iterations: int = 1
     current_iteration: int = 0
@@ -75,7 +76,6 @@ class TestRun:
     post_test: Optional[TestScenario] = None
     reports: Set[Type[ReportGenerationStrategy]] = field(default_factory=set)
     extra_srun_args: str | None = None
-    exclude_nodes: str | None = None
 
     def __hash__(self) -> int:
         return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration))
diff --git a/src/cloudai/models/scenario.py b/src/cloudai/models/scenario.py
index 823a83527..beeb84244 100644
--- a/src/cloudai/models/scenario.py
+++ b/src/cloudai/models/scenario.py
@@ -73,6 +73,13 @@ class TestRunModel(BaseModel):
     test_name: Optional[str] = None
     num_nodes: int | list[int] | None = None
     nodes: list[str] = Field(default_factory=list)
+    exclude_nodes: list[str] = Field(
+        default_factory=list,
+        description=(
+            "Hostnames to exclude from the resolved node list. "
+            "Supports Slurm range syntax, e.g. ['node-048', 'node-[101-104]']."
+        ),
+    )
     weight: int = 0
     iterations: int = 1
     sol: Optional[float] = None
@@ -80,10 +87,6 @@ class TestRunModel(BaseModel):
     time_limit: Optional[str] = None
     dependencies: list[TestRunDependencyModel] = Field(default_factory=list)
     extra_srun_args: str | None = None
-    exclude_nodes: str | None = Field(
-        default=None,
-        description="Comma-separated hostnames to exclude from the resolved node list.",
-    )
 
     # test definition fields
     name: Optional[str] = None
diff --git a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
index 24725592f..67619a1b9 100644
--- a/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
+++ b/src/cloudai/systems/slurm/slurm_command_gen_strategy.py
@@ -26,7 +26,7 @@
 from cloudai.core import CommandGenStrategy, Registry, System, TestRun, TestScenario
 from cloudai.models.scenario import TestRunDetails
 
-from .slurm_system import SlurmSystem, parse_node_list
+from .slurm_system import SlurmSystem
 
 
 class SlurmCommandGenStrategy(CommandGenStrategy):
@@ -425,7 +425,7 @@ def _append_nodes_related_directives(self, content: List[str]) -> Optional[Path]
         content.append(f"#SBATCH -N {num_nodes}")
 
         if self.test_run.exclude_nodes:
-            content.append(f"#SBATCH --exclude={self.test_run.exclude_nodes}")
+            content.append(f"#SBATCH --exclude={','.join(self.test_run.exclude_nodes)}")
 
         return None
 
@@ -468,7 +468,7 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]:
                 str(self.test_run.step),
                 str(self.test_run.nnodes),
                 ",".join(self.test_run.nodes),
-                self.test_run.exclude_nodes or "",
+                ",".join(self.test_run.exclude_nodes),
             ]
         )
 
@@ -476,9 +476,8 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]:
             logging.debug(f"Using cached node allocation for {cache_key}: {self._node_spec_cache[cache_key]}")
             return self._node_spec_cache[cache_key]
 
-        excluded = set(parse_node_list(self.test_run.exclude_nodes)) if self.test_run.exclude_nodes else None
         num_nodes, node_list = self.system.get_nodes_by_spec(
-            self.test_run.nnodes, self.test_run.nodes, exclude_nodes=excluded
+            self.test_run.nnodes, self.test_run.nodes, exclude_nodes=self.test_run.exclude_nodes or None
         )
 
         self._node_spec_cache[cache_key] = (num_nodes, node_list)
diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py
index 46bb9b1bf..ec5784ae5 100644
--- a/src/cloudai/systems/slurm/slurm_system.py
+++ b/src/cloudai/systems/slurm/slurm_system.py
@@ -447,7 +447,7 @@ def get_available_nodes_from_group(
         partition_name: str,
         group_name: str,
         number_of_nodes: Union[int, str],
-        exclude_nodes: set[str] | None = None,
+        exclude_nodes: list[str] | None = None,
     ) -> List[SlurmNode]:
         """
         Retrieve a specific number of potentially available nodes from a group within a partition.
@@ -460,7 +460,7 @@ def get_available_nodes_from_group(
             group_name (str): The name of the group.
             number_of_nodes (Union[int,str]): The number of nodes to retrieve.
                 Could also be 'all' to retrieve all the nodes from the group.
-            exclude_nodes (set[str] | None): Node names to exclude from the pool before selection.
+            exclude_nodes (list[str] | None): Node names to exclude from the pool before selection.
 
         Returns:
             List[SlurmNode]: Objects that are potentially available for use.
@@ -514,7 +514,7 @@ def group_nodes_by_state(
         self,
         partition_name: str,
         group_name: str,
-        exclude_nodes: set[str] | None = None,
+        exclude_nodes: list[str] | None = None,
     ) -> Dict[SlurmNodeState, List[SlurmNode]]:
         """
         Group nodes by their states, excluding nodes allocated to the current user.
@@ -522,7 +522,7 @@ def group_nodes_by_state(
         Args:
             partition_name (str): The name of the partition.
             group_name (str): The name of the group.
-            exclude_nodes (set[str] | None): Node names to exclude from the pool before grouping.
+            exclude_nodes (list[str] | None): Node names to exclude from the pool before grouping.
 
         Returns:
             Dict[SlurmNodeState, List[SlurmNode]]: A dictionary grouping nodes by their state.
@@ -683,7 +683,7 @@ def convert_state_to_enum(self, state_str: str) -> SlurmNodeState:
                 logging.debug(f"Unknown node state: {core_state}")
                 return SlurmNodeState.UNKNOWN_STATE
 
-    def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) -> List[str]:
+    def parse_nodes(self, nodes: List[str], exclude_nodes: list[str] | None = None) -> List[str]:
         """
         Parse a list of node specifications into individual node names.
 
@@ -696,7 +696,8 @@ def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) -
                 "partition:group:num_nodes", where "partition" is the partition name, "group" is a group within that
                 partition, and "num_nodes" is the number of nodes requested. Node ranges should be specified with
                 square brackets and dashes, e.g., "node[01-03]" for "node01", "node02", "node03".
-            exclude_nodes (set[str] | None): Node names to exclude from group pools before selection.
+            exclude_nodes (list[str] | None): Node names (or Slurm range expressions) to exclude from group pools
+                before selection. Ranges are expanded internally.
 
         Returns:
             List[str]: A list of node names. For specifications, it includes names of allocated nodes based on the
@@ -706,6 +707,9 @@ def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) -
             ValueError: If a specification is malformed, a specified node is not found, or a node range cannot be
                 parsed. This ensures users are aware of incorrect inputs.
         """
+        if exclude_nodes:
+            exclude_nodes = [n for spec in exclude_nodes for n in parse_node_list(spec)]
+
         parsed_nodes = []
         for node_spec in nodes:
             if ":" in node_spec:
@@ -729,7 +733,7 @@ def parse_nodes(self, nodes: List[str], exclude_nodes: set[str] | None = None) -
         return parsed_nodes
 
     def get_nodes_by_spec(
-        self, num_nodes: int, nodes: list[str], exclude_nodes: set[str] | None = None
+        self, num_nodes: int, nodes: list[str], exclude_nodes: list[str] | None = None
     ) -> Tuple[int, list[str]]:
         """
         Retrieve a list of node names based on specifications.
@@ -740,7 +744,7 @@ def get_nodes_by_spec(
         Args:
             num_nodes (int): The number of nodes, can't be `0`.
             nodes (list[str]): A list of node names specifications, slurm format or `PARTITION:GROUP:NUM_NODES`.
-            exclude_nodes (set[str] | None): Node names to exclude from group pools before selection.
+            exclude_nodes (list[str] | None): Node names to exclude from group pools before selection.
 
         Returns:
             Tuple[int, list[str]]: The number of nodes and a list of node names.
diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index a05fe1079..f1854e732 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -410,7 +410,7 @@ def add_field(field: str, flag: str, value: Any) -> None:
             nodelist_str = ",".join(node_list)
             additional_slurm_params.append(f"nodelist={nodelist_str}")
         elif self.test_run.exclude_nodes:
-            additional_slurm_params.append(f"exclude={self.test_run.exclude_nodes}")
+            additional_slurm_params.append(f"exclude={','.join(self.test_run.exclude_nodes)}")
 
         if additional_slurm_params:
             parts.extend(["--additional_slurm_params", shlex.quote(" ".join(additional_slurm_params))])
diff --git a/tests/systems/slurm/test_allocation.py b/tests/systems/slurm/test_allocation.py
index cc690a920..c8f06a624 100644
--- a/tests/systems/slurm/test_allocation.py
+++ b/tests/systems/slurm/test_allocation.py
@@ -121,7 +121,7 @@ def test_exclude_nodes_selects_from_remaining_pool(
     ):
         """Excluding a node from a group should still yield the requested count from the remaining pool."""
         system, _all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
-        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:4"], exclude_nodes={"node03"})
+        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:4"], exclude_nodes=["node03"])
         assert nnodes == 4
         assert "node03" not in nodes_list
         assert len(nodes_list) == 4
@@ -129,7 +129,7 @@ def test_exclude_nodes_selects_from_remaining_pool(
     def test_exclude_multiple_nodes_from_group(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
         """Excluding multiple nodes still selects the requested count from remaining nodes."""
         system, _all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
-        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"], exclude_nodes={"node01", "node05"})
+        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"], exclude_nodes=["node01", "node05"])
         assert nnodes == 3
         assert "node01" not in nodes_list
         assert "node05" not in nodes_list
diff --git a/tests/systems/slurm/test_command_gen_strategy.py b/tests/systems/slurm/test_command_gen_strategy.py
index eeecc0298..98e103704 100644
--- a/tests/systems/slurm/test_command_gen_strategy.py
+++ b/tests/systems/slurm/test_command_gen_strategy.py
@@ -364,7 +364,7 @@ def test_distribution_fallback_when_no_nodes(strategy_fixture: SlurmCommandGenSt
 def test_exclude_nodes_directive_when_no_nodelist(strategy_fixture: SlurmCommandGenStrategy) -> None:
     strategy_fixture.test_run.nodes = []
     strategy_fixture.test_run.num_nodes = 3
-    strategy_fixture.test_run.exclude_nodes = "node01,node02"
+    strategy_fixture.test_run.exclude_nodes = ["node01", "node02"]
     content: List[str] = []
     strategy_fixture._append_nodes_related_directives(content)
 
@@ -374,7 +374,7 @@ def test_exclude_nodes_directive_when_no_nodelist(strategy_fixture: SlurmCommand
 
 def test_no_exclude_directive_when_nodelist_present(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> None:
     testrun_fixture.nodes = ["node3", "node4"]
-    testrun_fixture.exclude_nodes = "node01,node02"
+    testrun_fixture.exclude_nodes = ["node01", "node02"]
     strategy = MySlurmCommandGenStrategy(slurm_system, testrun_fixture)
     content: List[str] = []
     strategy._append_nodes_related_directives(content)
@@ -386,7 +386,7 @@ def test_no_exclude_directive_when_nodelist_present(slurm_system: SlurmSystem, t
 def test_no_exclude_directive_when_exclude_nodes_unset(strategy_fixture: SlurmCommandGenStrategy) -> None:
     strategy_fixture.test_run.nodes = []
     strategy_fixture.test_run.num_nodes = 2
-    strategy_fixture.test_run.exclude_nodes = None
+    strategy_fixture.test_run.exclude_nodes = []
     content: List[str] = []
     strategy_fixture._append_nodes_related_directives(content)
 
diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py
index 3a51f3360..0c0e240b0 100644
--- a/tests/systems/slurm/test_system.py
+++ b/tests/systems/slurm/test_system.py
@@ -370,7 +370,7 @@ def test_explicit_node_names(
     @patch("cloudai.systems.slurm.slurm_system.SlurmSystem.parse_nodes")
     def test_raises_when_all_nodes_excluded(self, mock_parse_nodes: Mock, slurm_system: SlurmSystem):
         mock_parse_nodes.return_value = []
-        exclude = {"node01", "node02"}
+        exclude = ["node01", "node02"]
 
         with pytest.raises(ValueError, match="after excluding nodes"):
             slurm_system.get_nodes_by_spec(2, ["node0[1-2]"], exclude_nodes=exclude)
@@ -385,7 +385,7 @@ def test_raises_when_parse_nodes_returns_empty_for_nonempty_specs(
             slurm_system.get_nodes_by_spec(1, ["main:group1:3"])
 
     def test_empty_nodes_with_exclude_still_returns_unconstrained(self, slurm_system: SlurmSystem):
-        num_nodes, node_list = slurm_system.get_nodes_by_spec(3, [], exclude_nodes={"node01"})
+        num_nodes, node_list = slurm_system.get_nodes_by_spec(3, [], exclude_nodes=["node01"])
         assert num_nodes == 3
         assert node_list == []
 

From 174dcefae7936388256d975e9d401f038afafe44 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 08:54:04 +0200
Subject: [PATCH 16/30] Change additional_slurm_params separator to semi-colon

---
 .../workloads/megatron_bridge/slurm_command_gen_strategy.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index f1854e732..522f4b750 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -413,7 +413,7 @@ def add_field(field: str, flag: str, value: Any) -> None:
             additional_slurm_params.append(f"exclude={','.join(self.test_run.exclude_nodes)}")
 
         if additional_slurm_params:
-            parts.extend(["--additional_slurm_params", shlex.quote(" ".join(additional_slurm_params))])
+            parts.extend(["--additional_slurm_params", shlex.quote(";".join(additional_slurm_params))])
 
         # Config variant
         add_field("config_variant", "-cv", args.config_variant)

From 3480ba87116b53ca01555adacee6a8001e8cc6c4 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 09:13:04 +0200
Subject: [PATCH 17/30] Make init_submodules optional

---
 .../test/gb200/megatron_bridge_qwen_30b.toml  |  1 +
 src/cloudai/_core/installables.py             |  1 +
 .../kubernetes/kubernetes_installer.py        | 13 ++++++-----
 src/cloudai/systems/slurm/slurm_installer.py  | 13 ++++++-----
 tests/test_git_repo_installer.py              | 23 ++++++++++++++++++-
 5 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
index d7903c026..45976b712 100644
--- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
@@ -24,6 +24,7 @@ extra_container_mounts = []
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
 commit = "r0.3.0"
 mount_as = "/opt/Megatron-Bridge"
+init_submodules = true
 
 [cmd_args]
 gpu_type = "gb200"
diff --git a/src/cloudai/_core/installables.py b/src/cloudai/_core/installables.py
index f8527876c..dc0126a0f 100644
--- a/src/cloudai/_core/installables.py
+++ b/src/cloudai/_core/installables.py
@@ -91,6 +91,7 @@ class GitRepo(Installable, BaseModel):
 
     url: str
     commit: str
+    init_submodules: bool = False
     installed_path: Optional[Path] = None
     mount_as: Optional[str] = None
 
diff --git a/src/cloudai/systems/kubernetes/kubernetes_installer.py b/src/cloudai/systems/kubernetes/kubernetes_installer.py
index f360230ea..0f88985ee 100644
--- a/src/cloudai/systems/kubernetes/kubernetes_installer.py
+++ b/src/cloudai/systems/kubernetes/kubernetes_installer.py
@@ -172,12 +172,13 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
                 rmtree(repo_path)
             return res
 
-        res = self._init_submodules(repo_path)
-        if not res.success:
-            logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
-            if repo_path.exists():
-                rmtree(repo_path)
-            return res
+        if item.init_submodules:
+            res = self._init_submodules(repo_path)
+            if not res.success:
+                logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
+                if repo_path.exists():
+                    rmtree(repo_path)
+                return res
 
         item.installed_path = repo_path
         return InstallStatusResult(True)
diff --git a/src/cloudai/systems/slurm/slurm_installer.py b/src/cloudai/systems/slurm/slurm_installer.py
index d286c32d7..352ea69a3 100644
--- a/src/cloudai/systems/slurm/slurm_installer.py
+++ b/src/cloudai/systems/slurm/slurm_installer.py
@@ -225,12 +225,13 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
                 rmtree(repo_path)
             return res
 
-        res = self._init_submodules(repo_path)
-        if not res.success:
-            logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
-            if repo_path.exists():
-                rmtree(repo_path)
-            return res
+        if item.init_submodules:
+            res = self._init_submodules(repo_path)
+            if not res.success:
+                logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
+                if repo_path.exists():
+                    rmtree(repo_path)
+                return res
 
         item.installed_path = repo_path
         return InstallStatusResult(True)
diff --git a/tests/test_git_repo_installer.py b/tests/test_git_repo_installer.py
index 3d12298c1..dea795790 100644
--- a/tests/test_git_repo_installer.py
+++ b/tests/test_git_repo_installer.py
@@ -258,6 +258,7 @@ def test_error_initializing_submodules(self, installer: Union[KubernetesInstalle
     def test_submodule_failure_cleans_up_repo(
         self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo
     ):
+        git.init_submodules = True
         repo_path = installer.system.install_path / git.repo_name
         installer._clone_repository = Mock(
             side_effect=lambda url, path: (path.mkdir(parents=True, exist_ok=True), InstallStatusResult(True))[1]
@@ -273,11 +274,31 @@ def test_submodule_failure_cleans_up_repo(
     def test_all_good_flow(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo):
         installer._clone_repository = Mock(return_value=InstallStatusResult(True))
         installer._checkout_commit = Mock(return_value=InstallStatusResult(True))
-        installer._init_submodules = Mock(return_value=InstallStatusResult(True))
         res = installer._install_one_git_repo(git)
         assert res.success
         assert git.installed_path == installer.system.install_path / git.repo_name
 
+    def test_submodules_skipped_when_not_requested(
+        self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo
+    ):
+        installer._clone_repository = Mock(return_value=InstallStatusResult(True))
+        installer._checkout_commit = Mock(return_value=InstallStatusResult(True))
+        installer._init_submodules = Mock(return_value=InstallStatusResult(True))
+        res = installer._install_one_git_repo(git)
+        assert res.success
+        installer._init_submodules.assert_not_called()
+
+    def test_submodules_run_when_requested(
+        self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo
+    ):
+        git.init_submodules = True
+        installer._clone_repository = Mock(return_value=InstallStatusResult(True))
+        installer._checkout_commit = Mock(return_value=InstallStatusResult(True))
+        installer._init_submodules = Mock(return_value=InstallStatusResult(True))
+        res = installer._install_one_git_repo(git)
+        assert res.success
+        installer._init_submodules.assert_called_once()
+
     def test_uninstall_no_repo(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo):
         res = installer._uninstall_git_repo(git)
         assert res.success

From e12ec45d4ee22cf3012e63d3a6225f8ac856ac1e Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 09:21:28 +0200
Subject: [PATCH 18/30] Update configurations

---
 .../test/b200/megatron_bridge_qwen_30b.toml               | 8 ++++++--
 .../test/gb200/megatron_bridge_qwen_30b.toml              | 1 -
 .../test/gb300/megatron_bridge_qwen_30b.toml              | 8 ++++++--
 .../test/h100/megatron_bridge_qwen_30b.toml               | 8 ++++++--
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml
index 238dbc7de..108f78993 100644
--- a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml
@@ -22,12 +22,13 @@ extra_container_mounts = []
 
 [[git_repos]]
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
-commit = "v0.3.0"
+commit = "r0.3.0"
 mount_as = "/opt/Megatron-Bridge"
+init_submodules = true
 
 [cmd_args]
 gpu_type = "b200"
-container_image = "nvcr.io#nvidia/nemo:26.02.00"
+container_image = "nvcr.io#nvidia/nemo:26.02"
 model_family_name = "qwen"
 model_recipe_name = "qwen3_30b_a3b"
 gpus_per_node = 8
@@ -37,3 +38,6 @@ task = "pretrain"
 compute_dtype = "fp8_mx"
 hf_token = ""
 enable_vboost = true
+
+[extra_env_vars]
+PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
index 45976b712..2225ac2ed 100644
--- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
@@ -38,7 +38,6 @@ task = "pretrain"
 compute_dtype = "fp8_mx"
 hf_token = ""
 enable_vboost = true
-detach = false
 
 [extra_env_vars]
 PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
diff --git a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml
index 22b692e27..44b724e75 100644
--- a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml
@@ -22,12 +22,13 @@ extra_container_mounts = []
 
 [[git_repos]]
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
-commit = "v0.3.0"
+commit = "r0.3.0"
 mount_as = "/opt/Megatron-Bridge"
+init_submodules = true
 
 [cmd_args]
 gpu_type = "gb300"
-container_image = "nvcr.io#nvidia/nemo:26.02.00"
+container_image = "nvcr.io#nvidia/nemo:26.02"
 model_family_name = "qwen"
 model_recipe_name = "qwen3_30b_a3b"
 gpus_per_node = 4
@@ -37,3 +38,6 @@ task = "pretrain"
 compute_dtype = "fp8_mx"
 hf_token = ""
 enable_vboost = true
+
+[extra_env_vars]
+PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
diff --git a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml
index 68d378052..200e256c6 100644
--- a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml
@@ -22,12 +22,13 @@ extra_container_mounts = []
 
 [[git_repos]]
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
-commit = "v0.3.0"
+commit = "r0.3.0"
 mount_as = "/opt/Megatron-Bridge"
+init_submodules = true
 
 [cmd_args]
 gpu_type = "h100"
-container_image = "nvcr.io#nvidia/nemo:26.02.00"
+container_image = "nvcr.io#nvidia/nemo:26.02"
 model_family_name = "qwen"
 model_recipe_name = "qwen3_30b_a3b"
 gpus_per_node = 8
@@ -37,3 +38,6 @@ task = "pretrain"
 compute_dtype = "fp8_cs"
 hf_token = ""
 enable_vboost = true
+
+[extra_env_vars]
+PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"

From af471d5371c0d0da9cbb2497e1a12caae3e8fa73 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 10:06:53 +0200
Subject: [PATCH 19/30] Make extra_srun_args correctly propagate to M-Bridge

---
 .../slurm_command_gen_strategy.py             | 21 ++++++++++++++++
 .../test_command_gen_strategy_slurm.py        | 24 +++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index 522f4b750..eda0ba1d0 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -108,6 +108,23 @@ def _normalize_recompute_modules(self, val: Any) -> str:
         joined = ",".join(items)
         return f'"{joined}"'
 
+    @staticmethod
+    def _parse_srun_args_as_slurm_params(srun_args: str) -> list[str]:
+        """Convert ``--key value`` pairs from extra_srun_args into ``key=value`` for --additional_slurm_params."""
+        params: list[str] = []
+        tokens = srun_args.split()
+        i = 0
+        while i < len(tokens):
+            tok = tokens[i]
+            if tok.startswith("--") and "=" in tok:
+                key, val = tok[2:].split("=", 1)
+                params.append(f"{key}={val}")
+            elif tok.startswith("--") and i + 1 < len(tokens) and not tokens[i + 1].startswith("--"):
+                params.append(f"{tok[2:]}={tokens[i + 1]}")
+                i += 1
+            i += 1
+        return params
+
     def _normalize_cuda_graph_scope_arg(self, val: Any) -> str:
         s = str(val).strip().strip("\"'")
         if s.startswith("[") and s.endswith("]"):
@@ -412,6 +429,10 @@ def add_field(field: str, flag: str, value: Any) -> None:
         elif self.test_run.exclude_nodes:
             additional_slurm_params.append(f"exclude={','.join(self.test_run.exclude_nodes)}")
 
+        for source in (self.system.extra_srun_args, self.test_run.extra_srun_args):
+            if source:
+                additional_slurm_params.extend(self._parse_srun_args_as_slurm_params(source))
+
         if additional_slurm_params:
             parts.extend(["--additional_slurm_params", shlex.quote(";".join(additional_slurm_params))])
 
diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
index 7dfbfa7ec..4970eae17 100644
--- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
@@ -349,3 +349,27 @@ def test_gpus_per_node_passed_as_additional_slurm_param(
         wrapper_content = self._wrapper_content(cmd_gen)
         assert "--additional_slurm_params" in wrapper_content
         assert "gpus-per-node=2" in wrapper_content
+
+    def test_system_extra_srun_args_forwarded(
+        self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
+    ) -> None:
+        configured_slurm_system.extra_srun_args = "--reservation my_reserv"
+        tr = make_test_run(output_subdir="out_srun")
+        cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
+        wrapper_content = self._wrapper_content(cmd_gen)
+        assert "reservation=my_reserv" in wrapper_content
+
+    def test_test_run_extra_srun_args_forwarded(
+        self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
+    ) -> None:
+        tr = make_test_run(output_subdir="out_tr_srun")
+        tr.extra_srun_args = "--exclusive"
+        cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
+        wrapper_content = self._wrapper_content(cmd_gen)
+        assert "--additional_slurm_params" in wrapper_content
+
+    def test_parse_srun_args_as_slurm_params(self) -> None:
+        result = MegatronBridgeSlurmCommandGenStrategy._parse_srun_args_as_slurm_params(
+            "--reservation my_reserv --constraint=gpu"
+        )
+        assert result == ["reservation=my_reserv", "constraint=gpu"]

From 52cef4c10f51839763fa031ea75d867f908d88f4 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 10:08:10 +0200
Subject: [PATCH 20/30] Allow submission of jobs to resv nodes

---
 src/cloudai/systems/slurm/slurm_system.py | 2 ++
 tests/systems/slurm/test_system.py        | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py
index ec5784ae5..2ec73b6fb 100644
--- a/src/cloudai/systems/slurm/slurm_system.py
+++ b/src/cloudai/systems/slurm/slurm_system.py
@@ -531,6 +531,7 @@ def group_nodes_by_state(
             SlurmNodeState.IDLE: [],
             SlurmNodeState.COMPLETING: [],
             SlurmNodeState.ALLOCATED: [],
+            SlurmNodeState.RESERVED: [],
         }
 
         for node in self.groups[partition_name][group_name]:
@@ -566,6 +567,7 @@ def allocate_nodes(
         if isinstance(number_of_nodes, str) and number_of_nodes == "max_avail":
             allocated_nodes.extend(grouped_nodes[SlurmNodeState.IDLE])
             allocated_nodes.extend(grouped_nodes[SlurmNodeState.COMPLETING])
+            allocated_nodes.extend(grouped_nodes[SlurmNodeState.RESERVED])
 
             if len(allocated_nodes) == 0:
                 raise ValueError(
diff --git a/tests/systems/slurm/test_system.py b/tests/systems/slurm/test_system.py
index 0c0e240b0..477015ed2 100644
--- a/tests/systems/slurm/test_system.py
+++ b/tests/systems/slurm/test_system.py
@@ -140,6 +140,7 @@ def grouped_nodes() -> dict[SlurmNodeState, list[SlurmNode]]:
             SlurmNode(name="node04", partition=partition_name, state=SlurmNodeState.COMPLETING)
         ],
         SlurmNodeState.ALLOCATED: [SlurmNode(name="node05", partition=partition_name, state=SlurmNodeState.ALLOCATED)],
+        SlurmNodeState.RESERVED: [SlurmNode(name="node06", partition=partition_name, state=SlurmNodeState.RESERVED)],
     }
 
     return grouped_nodes
@@ -166,6 +167,7 @@ def test_allocate_nodes_max_avail(slurm_system: SlurmSystem, grouped_nodes: dict
         grouped_nodes[SlurmNodeState.IDLE][0].name,
         grouped_nodes[SlurmNodeState.IDLE][1].name,
         grouped_nodes[SlurmNodeState.COMPLETING][0].name,
+        grouped_nodes[SlurmNodeState.RESERVED][0].name,
     ]
     returned_node_names = [node.name for node in available_nodes]
 
@@ -193,8 +195,8 @@ def test_allocate_nodes_exceeding_limit(
     slurm_system: SlurmSystem, grouped_nodes: dict[SlurmNodeState, list[SlurmNode]]
 ):
     group_name = "group_name"
-    num_nodes = 5
-    available_nodes = 4
+    num_nodes = 6
+    available_nodes = 5
 
     with pytest.raises(
         ValueError,

From 62f5cf7fb26092da8d203edf3086d57b76c852ec Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 10:52:00 +0200
Subject: [PATCH 21/30] Make the no-mount assertion less brittle

---
 .../megatron_bridge/test_command_gen_strategy_slurm.py    | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
index 4970eae17..4e765955d 100644
--- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
@@ -336,10 +336,14 @@ def test_no_mount_as_skips_repo_container_mount(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
     ) -> None:
         tr = make_test_run(mount_as=None, output_subdir="out_no_mount")
+        tdef = cast(MegatronBridgeTestDefinition, tr.test)
+        repo_path = tdef.megatron_bridge_repo.installed_path
+        assert repo_path is not None
+
         cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
         wrapper_content = self._wrapper_content(cmd_gen)
-        assert "-cm" not in wrapper_content
-        assert "/opt/Megatron-Bridge" not in wrapper_content
+        assert f"{repo_path.absolute()}:" not in wrapper_content
+        assert ":/opt/Megatron-Bridge" not in wrapper_content
 
     def test_gpus_per_node_passed_as_additional_slurm_param(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]

From f1b027edc5c00c9b1c26f3c8a274deca2584250d Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 10:59:30 +0200
Subject: [PATCH 22/30] Fix and improve the extra_srun_args propagation

---
 .../megatron_bridge/slurm_command_gen_strategy.py      |  9 ++++++++-
 .../megatron_bridge/test_command_gen_strategy_slurm.py | 10 ++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index eda0ba1d0..d814a6859 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -110,7 +110,12 @@ def _normalize_recompute_modules(self, val: Any) -> str:
 
     @staticmethod
     def _parse_srun_args_as_slurm_params(srun_args: str) -> list[str]:
-        """Convert ``--key value`` pairs from extra_srun_args into ``key=value`` for --additional_slurm_params."""
+        """
+        Convert ``--key value`` pairs from extra_srun_args into ``key=value`` for --additional_slurm_params.
+
+        Standalone boolean flags (e.g. ``--exclusive``) are emitted as bare
+        key names without a ``=value`` suffix.
+        """
         params: list[str] = []
         tokens = srun_args.split()
         i = 0
@@ -122,6 +127,8 @@ def _parse_srun_args_as_slurm_params(srun_args: str) -> list[str]:
             elif tok.startswith("--") and i + 1 < len(tokens) and not tokens[i + 1].startswith("--"):
                 params.append(f"{tok[2:]}={tokens[i + 1]}")
                 i += 1
+            elif tok.startswith("--"):
+                params.append(tok[2:])
             i += 1
         return params
 
diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
index 4e765955d..b8e599388 100644
--- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
@@ -367,13 +367,19 @@ def test_test_run_extra_srun_args_forwarded(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
     ) -> None:
         tr = make_test_run(output_subdir="out_tr_srun")
-        tr.extra_srun_args = "--exclusive"
+        tr.extra_srun_args = "--constraint gpu"
         cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
         wrapper_content = self._wrapper_content(cmd_gen)
-        assert "--additional_slurm_params" in wrapper_content
+        assert "constraint=gpu" in wrapper_content
 
     def test_parse_srun_args_as_slurm_params(self) -> None:
         result = MegatronBridgeSlurmCommandGenStrategy._parse_srun_args_as_slurm_params(
             "--reservation my_reserv --constraint=gpu"
         )
         assert result == ["reservation=my_reserv", "constraint=gpu"]
+
+    def test_parse_srun_args_boolean_flags(self) -> None:
+        result = MegatronBridgeSlurmCommandGenStrategy._parse_srun_args_as_slurm_params(
+            "--exclusive --reservation my_reserv --overcommit"
+        )
+        assert result == ["exclusive", "reservation=my_reserv", "overcommit"]

From b6a5dcf3d0928c9d3f81e53a401e4d4a42a131da Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Mon, 16 Mar 2026 12:23:45 +0200
Subject: [PATCH 23/30] ruff format & copyright year

---
 .../test_scenario/megatron_bridge_qwen_30b.toml               | 2 +-
 src/cloudai/_core/installables.py                             | 2 +-
 tests/test_git_repo_installer.py                              | 4 +---
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_qwen_30b.toml
index 16d218f84..d9dc23419 100644
--- a/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_qwen_30b.toml
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/cloudai/_core/installables.py b/src/cloudai/_core/installables.py
index dc0126a0f..d6db18589 100644
--- a/src/cloudai/_core/installables.py
+++ b/src/cloudai/_core/installables.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tests/test_git_repo_installer.py b/tests/test_git_repo_installer.py
index dea795790..623fcc33c 100644
--- a/tests/test_git_repo_installer.py
+++ b/tests/test_git_repo_installer.py
@@ -288,9 +288,7 @@ def test_submodules_skipped_when_not_requested(
         assert res.success
         installer._init_submodules.assert_not_called()
 
-    def test_submodules_run_when_requested(
-        self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo
-    ):
+    def test_submodules_run_when_requested(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo):
         git.init_submodules = True
         installer._clone_repository = Mock(return_value=InstallStatusResult(True))
         installer._checkout_commit = Mock(return_value=InstallStatusResult(True))

From f463df331275960f5b2a610ed11a806f23cd44c1 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Tue, 17 Mar 2026 06:36:19 +0200
Subject: [PATCH 24/30] Fix init_submodules is silently bypassed on
 pre-existing clones

---
 .../systems/kubernetes/kubernetes_installer.py      | 13 ++++++++++++-
 src/cloudai/systems/slurm/slurm_installer.py        | 13 ++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/cloudai/systems/kubernetes/kubernetes_installer.py b/src/cloudai/systems/kubernetes/kubernetes_installer.py
index 0f88985ee..5b1ebdf30 100644
--- a/src/cloudai/systems/kubernetes/kubernetes_installer.py
+++ b/src/cloudai/systems/kubernetes/kubernetes_installer.py
@@ -156,11 +156,23 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
             verify_res = self._verify_commit(item.commit, repo_path)
             if not verify_res.success:
                 return verify_res
+            if item.init_submodules:
+                res = self._init_submodules(repo_path)
+                if not res.success:
+                    return res
             item.installed_path = repo_path
             msg = f"Git repository already exists at {repo_path}."
             logging.debug(msg)
             return InstallStatusResult(True, msg)
 
+        res = self._clone_and_setup_repo(item, repo_path)
+        if not res.success:
+            return res
+
+        item.installed_path = repo_path
+        return InstallStatusResult(True)
+
+    def _clone_and_setup_repo(self, item: GitRepo, repo_path: Path) -> InstallStatusResult:
         res = self._clone_repository(item.url, repo_path)
         if not res.success:
             return res
@@ -180,7 +192,6 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
                     rmtree(repo_path)
                 return res
 
-        item.installed_path = repo_path
         return InstallStatusResult(True)
 
     def _install_python_executable(self, item: PythonExecutable) -> InstallStatusResult:
diff --git a/src/cloudai/systems/slurm/slurm_installer.py b/src/cloudai/systems/slurm/slurm_installer.py
index 352ea69a3..e30a87395 100644
--- a/src/cloudai/systems/slurm/slurm_installer.py
+++ b/src/cloudai/systems/slurm/slurm_installer.py
@@ -209,11 +209,23 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
             verify_res = self._verify_commit(item.commit, repo_path)
             if not verify_res.success:
                 return verify_res
+            if item.init_submodules:
+                res = self._init_submodules(repo_path)
+                if not res.success:
+                    return res
             item.installed_path = repo_path
             msg = f"Git repository already exists at {repo_path}."
             logging.debug(msg)
             return InstallStatusResult(True, msg)
 
+        res = self._clone_and_setup_repo(item, repo_path)
+        if not res.success:
+            return res
+
+        item.installed_path = repo_path
+        return InstallStatusResult(True)
+
+    def _clone_and_setup_repo(self, item: GitRepo, repo_path: Path) -> InstallStatusResult:
         res = self._clone_repository(item.url, repo_path)
         if not res.success:
             return res
@@ -233,7 +245,6 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
                     rmtree(repo_path)
                 return res
 
-        item.installed_path = repo_path
         return InstallStatusResult(True)
 
     def _install_python_executable(self, item: PythonExecutable) -> InstallStatusResult:

From 6edc15ee4d605c83bb71210cf95e3aef36b1d206 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Tue, 17 Mar 2026 06:37:53 +0200
Subject: [PATCH 25/30] Use shlex.split() instead of str.split()

---
 .../workloads/megatron_bridge/slurm_command_gen_strategy.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index d814a6859..3c5ee258a 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -117,7 +117,7 @@ def _parse_srun_args_as_slurm_params(srun_args: str) -> list[str]:
         key names without a ``=value`` suffix.
         """
         params: list[str] = []
-        tokens = srun_args.split()
+        tokens = shlex.split(srun_args)
         i = 0
         while i < len(tokens):
             tok = tokens[i]

From 66fbb33565c954d3803797054b379c2844d51392 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Tue, 17 Mar 2026 06:39:54 +0200
Subject: [PATCH 26/30] Add missing regression: existing repo +
 init_submodules=True

---
 tests/test_git_repo_installer.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/test_git_repo_installer.py b/tests/test_git_repo_installer.py
index 623fcc33c..fbcaffe52 100644
--- a/tests/test_git_repo_installer.py
+++ b/tests/test_git_repo_installer.py
@@ -297,6 +297,19 @@ def test_submodules_run_when_requested(self, installer: Union[KubernetesInstalle
         assert res.success
         installer._init_submodules.assert_called_once()
 
+    def test_existing_repo_inits_submodules_when_requested(
+        self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo
+    ):
+        git.init_submodules = True
+        repo_path = installer.system.install_path / git.repo_name
+        repo_path.mkdir()
+        installer._verify_commit = Mock(return_value=InstallStatusResult(True))
+        installer._init_submodules = Mock(return_value=InstallStatusResult(True))
+        res = installer._install_one_git_repo(git)
+        assert res.success
+        assert git.installed_path == repo_path
+        installer._init_submodules.assert_called_once_with(repo_path)
+
     def test_uninstall_no_repo(self, installer: Union[KubernetesInstaller, SlurmInstaller], git: GitRepo):
         res = installer._uninstall_git_repo(git)
         assert res.success

From 59166d61fe27fe7204b5018c9599cba5079cc113 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Thu, 19 Mar 2026 09:08:23 +0200
Subject: [PATCH 27/30] Export container-runtime env vars before
 Megatron-Bridge launcher

---
 .../slurm_command_gen_strategy.py             | 31 +++++++++++++++++-
 .../test_command_gen_strategy_slurm.py        | 32 +++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index 3c5ee258a..b590444a7 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -37,6 +37,14 @@ class MegatronBridgeSlurmCommandGenStrategy(SlurmCommandGenStrategy):
     The launcher submits the actual training sbatch job; CloudAI tracks that job ID via SlurmRunner parsing.
     """
 
+    CONTAINER_RUNTIME_ENV_VARS: frozenset[str] = frozenset(
+        {
+            "MELLANOX_VISIBLE_DEVICES",
+            "NVIDIA_VISIBLE_DEVICES",
+            "NVIDIA_DRIVER_CAPABILITIES",
+        }
+    )
+
     def _container_mounts(self) -> list[str]:
         # This workload submits its own sbatch job and passes mounts via `-cm`.
         return []
@@ -96,6 +104,25 @@ def _build_custom_bash_env_exports(self) -> list[str]:
             exports.extend(["-cb", shlex.quote(f"export {key}={value}")])
         return exports
 
+    def _container_runtime_env_exports(self) -> list[str]:
+        """
+        Build ``export`` lines for container-runtime env vars.
+
+        Variables like ``MELLANOX_VISIBLE_DEVICES`` and ``NVIDIA_VISIBLE_DEVICES``
+        are consumed by the NVIDIA container toolkit / enroot at container-creation
+        time to decide which devices to mount.  They must be present in the process
+        environment **before** the Megatron-Bridge launcher calls ``sbatch`` so that
+        Slurm inherits them into the job and ``srun`` passes them to the container
+        runtime.  Exporting them in the wrapper script (which runs on the submit
+        node) achieves this.  The same variables are still passed via ``-cb`` as
+        well, so they are also set inside the container for any runtime readers.
+        """
+        lines: list[str] = []
+        for key, value in sorted(self.final_env_vars.items()):
+            if key in self.CONTAINER_RUNTIME_ENV_VARS:
+                lines.append(f"export {key}={shlex.quote(str(value))}")
+        return lines
+
     def _normalize_recompute_modules(self, val: Any) -> str:
         if isinstance(val, list):
             items = [str(x).strip().strip("\"'") for x in val if str(x).strip()]
@@ -152,6 +179,8 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str, launcher
         wrapper_path = output_dir / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
         log_path = output_dir / "cloudai_megatron_bridge_launcher.log"
 
+        container_runtime_exports = self._container_runtime_env_exports()
+
         script_lines = [
             "#!/usr/bin/env bash",
             "set -o pipefail",
@@ -164,7 +193,7 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str, launcher
             # Mirror wrapper stdout/stderr to files for debugging while still emitting to the parent process.
             'exec > >(tee -a "$WRAPPER_STDOUT") 2> >(tee -a "$WRAPPER_STDERR" >&2)',
             "",
-            # Launch Megatron-Bridge (log stdout/stderr to file)
+            *container_runtime_exports,
             "",
             ': >"$LOG"',
             "WANDB_INSTALL_RC=0",
diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
index b8e599388..b0395d0a6 100644
--- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
@@ -225,6 +225,38 @@ def test_env_vars_are_forwarded_via_custom_bash_cmds(
         assert "-cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3'" in wrapper_content
         assert "-cb 'export NCCL_DEBUG=INFO'" in wrapper_content
 
+    def test_container_runtime_env_vars_exported_in_wrapper_script(
+        self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
+    ) -> None:
+        configured_slurm_system.global_env_vars = {
+            "MELLANOX_VISIBLE_DEVICES": "0,1,4,5",
+            "NCCL_IB_HCA": "roce_p0_r0,roce_p0_r1,roce_p0_r2,roce_p0_r3",
+            "NCCL_IB_GID_INDEX": "3",
+        }
+        tr = make_test_run(output_subdir="out_container_rt")
+        tdef = cast(MegatronBridgeTestDefinition, tr.test)
+        tdef.extra_env_vars = {"NVIDIA_VISIBLE_DEVICES": "all", "NCCL_DEBUG": "INFO"}
+
+        cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
+        wrapper_content = self._wrapper_content(cmd_gen)
+
+        launcher_idx = wrapper_content.index("setup_experiment.py")
+
+        assert "export MELLANOX_VISIBLE_DEVICES=0,1,4,5" in wrapper_content
+        assert "export NVIDIA_VISIBLE_DEVICES=all" in wrapper_content
+        mvd_idx = wrapper_content.index("export MELLANOX_VISIBLE_DEVICES=")
+        nvd_idx = wrapper_content.index("export NVIDIA_VISIBLE_DEVICES=")
+        assert mvd_idx < launcher_idx, "MELLANOX_VISIBLE_DEVICES must be exported before the launcher"
+        assert nvd_idx < launcher_idx, "NVIDIA_VISIBLE_DEVICES must be exported before the launcher"
+
+        assert "-cb 'export MELLANOX_VISIBLE_DEVICES=0,1,4,5'" in wrapper_content
+        assert "-cb 'export NVIDIA_VISIBLE_DEVICES=all'" in wrapper_content
+        assert "-cb 'export NCCL_IB_HCA=roce_p0_r0,roce_p0_r1,roce_p0_r2,roce_p0_r3'" in wrapper_content
+        assert "-cb 'export NCCL_DEBUG=INFO'" in wrapper_content
+
+        assert "export NCCL_IB_HCA=" not in wrapper_content.split("setup_experiment.py")[0]
+        assert "export NCCL_DEBUG=" not in wrapper_content.split("setup_experiment.py")[0]
+
     def test_wrapper_emits_job_id_even_when_launcher_non_zero(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
     ) -> None:

From f5681b83cae3dd635355e717940ff644e16be43a Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Thu, 19 Mar 2026 15:33:42 +0200
Subject: [PATCH 28/30] Add supports_gpu_directives check

---
 .../megatron_bridge/slurm_command_gen_strategy.py    |  3 ++-
 .../test_command_gen_strategy_slurm.py               | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
index b590444a7..abb0a1330 100644
--- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -455,8 +455,9 @@ def add_field(field: str, flag: str, value: Any) -> None:
 
         additional_slurm_params: list[str] = []
 
-        if args.gpus_per_node:
+        if args.gpus_per_node and self.system.supports_gpu_directives:
             additional_slurm_params.append(f"gpus-per-node={args.gpus_per_node}")
+            additional_slurm_params.append(f"gres=gpu:{args.gpus_per_node}")
 
         _, node_list = self.get_cached_nodes_spec()
         if node_list:
diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
index b0395d0a6..6a977d95e 100644
--- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py
@@ -380,11 +380,23 @@ def test_no_mount_as_skips_repo_container_mount(
     def test_gpus_per_node_passed_as_additional_slurm_param(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
     ) -> None:
+        configured_slurm_system.supports_gpu_directives_cache = True
         tr = make_test_run(cmd_args_overrides={"gpus_per_node": 2}, output_subdir="out_gpus")
         cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
         wrapper_content = self._wrapper_content(cmd_gen)
         assert "--additional_slurm_params" in wrapper_content
         assert "gpus-per-node=2" in wrapper_content
+        assert "gres=gpu:2" in wrapper_content
+
+    def test_gpus_per_node_skipped_when_gpu_directives_unsupported(
+        self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]
+    ) -> None:
+        configured_slurm_system.supports_gpu_directives_cache = False
+        tr = make_test_run(cmd_args_overrides={"gpus_per_node": 2}, output_subdir="out_no_gpu_directives")
+        cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr)
+        wrapper_content = self._wrapper_content(cmd_gen)
+        assert "gpus-per-node=2" not in wrapper_content
+        assert "gres=gpu:2" not in wrapper_content
 
     def test_system_extra_srun_args_forwarded(
         self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]

From 15081a693a83b89dff9f90c358578540fcae3252 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Thu, 19 Mar 2026 15:46:28 +0200
Subject: [PATCH 29/30] Fix tests

---
 tests/ref_data/megatron-bridge.sbatch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ref_data/megatron-bridge.sbatch b/tests/ref_data/megatron-bridge.sbatch
index 4c95d637f..884b90091 100644
--- a/tests/ref_data/megatron-bridge.sbatch
+++ b/tests/ref_data/megatron-bridge.sbatch
@@ -19,7 +19,7 @@ if [ "${WANDB_INSTALL_RC}" -ne 0 ]; then
 fi
 
 LAUNCH_RC=0
-NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params gpus-per-node=4 >>"$LOG" 2>&1 || LAUNCH_RC=$?
+NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params 'gpus-per-node=4;gres=gpu:4' >>"$LOG" 2>&1 || LAUNCH_RC=$?
 
 
 JOB_ID=""

From 3a1a996893384ab5facb86fcf2a4b9ead8470ae6 Mon Sep 17 00:00:00 2001
From: Juntao Wang <juntaow@nvidia.com>
Date: Thu, 19 Mar 2026 17:04:27 +0200
Subject: [PATCH 30/30] Restructure configurations

---
 .../test/b200/megatron_bridge_qwen_30b.toml   |  9 ++----
 .../test/gb200/megatron_bridge_qwen_30b.toml  |  9 ++----
 .../test/gb300/megatron_bridge_qwen_30b.toml  | 10 ++----
 .../test/h100/megatron_bridge_qwen_30b.toml   |  9 ++----
 .../megatron_bridge_r0.3.0_qwen_30b.toml      | 31 +++++++++++++++++++
 5 files changed, 40 insertions(+), 28 deletions(-)
 create mode 100644 conf/experimental/megatron_bridge/test_scenario/megatron_bridge_r0.3.0_qwen_30b.toml

diff --git a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml
index 108f78993..89cd95dd0 100644
--- a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml
@@ -22,13 +22,11 @@ extra_container_mounts = []
 
 [[git_repos]]
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
-commit = "r0.3.0"
-mount_as = "/opt/Megatron-Bridge"
-init_submodules = true
+commit = "v0.3.0"
 
 [cmd_args]
 gpu_type = "b200"
-container_image = "nvcr.io#nvidia/nemo:26.02"
+container_image = "nvcr.io#nvidia/nemo:26.02.00"
 model_family_name = "qwen"
 model_recipe_name = "qwen3_30b_a3b"
 gpus_per_node = 8
@@ -38,6 +36,3 @@ task = "pretrain"
 compute_dtype = "fp8_mx"
 hf_token = ""
 enable_vboost = true
-
-[extra_env_vars]
-PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
index 2225ac2ed..d0ca6e2fb 100644
--- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
@@ -22,13 +22,11 @@ extra_container_mounts = []
 
 [[git_repos]]
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
-commit = "r0.3.0"
-mount_as = "/opt/Megatron-Bridge"
-init_submodules = true
+commit = "v0.3.0"
 
 [cmd_args]
 gpu_type = "gb200"
-container_image = "nvcr.io#nvidia/nemo:26.02"
+container_image = "nvcr.io#nvidia/nemo:26.02.00"
 model_family_name = "qwen"
 model_recipe_name = "qwen3_30b_a3b"
 gpus_per_node = 4
@@ -38,6 +36,3 @@ task = "pretrain"
 compute_dtype = "fp8_mx"
 hf_token = ""
 enable_vboost = true
-
-[extra_env_vars]
-PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
diff --git a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml
index 44b724e75..f68899a08 100644
--- a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml
@@ -22,22 +22,18 @@ extra_container_mounts = []
 
 [[git_repos]]
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
-commit = "r0.3.0"
-mount_as = "/opt/Megatron-Bridge"
-init_submodules = true
+commit = "v0.3.0"
 
 [cmd_args]
 gpu_type = "gb300"
-container_image = "nvcr.io#nvidia/nemo:26.02"
+container_image = "nvcr.io#nvidia/nemo:26.02.00"
 model_family_name = "qwen"
 model_recipe_name = "qwen3_30b_a3b"
 gpus_per_node = 4
 num_gpus = 8
+# mb = 4  # In case OOM, uncomment this for smaller micro-batch size
 domain = "llm"
 task = "pretrain"
 compute_dtype = "fp8_mx"
 hf_token = ""
 enable_vboost = true
-
-[extra_env_vars]
-PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
diff --git a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml
index 200e256c6..84c52f893 100644
--- a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml
+++ b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml
@@ -22,13 +22,11 @@ extra_container_mounts = []
 
 [[git_repos]]
 url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
-commit = "r0.3.0"
-mount_as = "/opt/Megatron-Bridge"
-init_submodules = true
+commit = "v0.3.0"
 
 [cmd_args]
 gpu_type = "h100"
-container_image = "nvcr.io#nvidia/nemo:26.02"
+container_image = "nvcr.io#nvidia/nemo:26.02.00"
 model_family_name = "qwen"
 model_recipe_name = "qwen3_30b_a3b"
 gpus_per_node = 8
@@ -38,6 +36,3 @@ task = "pretrain"
 compute_dtype = "fp8_cs"
 hf_token = ""
 enable_vboost = true
-
-[extra_env_vars]
-PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
diff --git a/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_r0.3.0_qwen_30b.toml b/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_r0.3.0_qwen_30b.toml
new file mode 100644
index 000000000..70e62e188
--- /dev/null
+++ b/conf/experimental/megatron_bridge/test_scenario/megatron_bridge_r0.3.0_qwen_30b.toml
@@ -0,0 +1,31 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "megatron_bridge_qwen_30b"
+
+[[Tests]]
+id = "megatron_bridge_qwen_30b"
+test_name = "megatron_bridge_qwen_30b"
+num_nodes = "2"
+
+  [[Tests.git_repos]]
+  url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
+  commit = "r0.3.0"
+  mount_as = "/opt/Megatron-Bridge"
+  init_submodules = true
+
+  [Tests.extra_env_vars]
+  PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"