Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a0d54a4
Make node placement correctly propagate M-Bridge
juntaowww Mar 6, 2026
e9b1218
Allow excluding nodes from group of nodes
juntaowww Mar 10, 2026
d720616
Also init submodules when install git repos
juntaowww Mar 11, 2026
ea4844d
Add options to mount the repo in the container
juntaowww Mar 11, 2026
9f786ff
Fix slurm gpu resource requesting
juntaowww Mar 11, 2026
e6c9cb2
Update gb200 M-Bridge r0.3.0 qwen recipe
juntaowww Mar 11, 2026
2acfb29
ruff check & ruff format
juntaowww Mar 11, 2026
45e6153
Fix tests to reflect updates
juntaowww Mar 11, 2026
e14fa6f
Update copyright year
juntaowww Mar 11, 2026
54f4fbb
Add back the defensive filter
juntaowww Mar 11, 2026
ff2f2fd
Enhance nodes allocation for edge cases
juntaowww Mar 11, 2026
f26fe7b
Enhance tests by avoiding default names
juntaowww Mar 11, 2026
a159736
Make exclude nodes correctly propagate to M-Bridge
juntaowww Mar 11, 2026
5da0a57
Improve error messages for excluding nodes
juntaowww Mar 16, 2026
2d7b98b
Improve exclude nodes APIs
juntaowww Mar 16, 2026
174dcef
Change additional_slurm_params separator to semi-colon
juntaowww Mar 16, 2026
3480ba8
Make init_submodules optional
juntaowww Mar 16, 2026
e12ec45
Update configurations
juntaowww Mar 16, 2026
af471d5
Make extra_srun_args correctly propagate to M-Bridge
juntaowww Mar 16, 2026
52cef4c
Allow submission of jobs to resv nodes
juntaowww Mar 16, 2026
62f5cf7
Make the no-mount assertion less brittle
juntaowww Mar 16, 2026
f1b027e
Fix and improve the extra_srun_args propagation
juntaowww Mar 16, 2026
b6a5dcf
ruff format & copyright year
juntaowww Mar 16, 2026
f463df3
Fix init_submodules is silently bypassed on pre-existing clones
juntaowww Mar 17, 2026
6edc15e
Use shlex.split() instead of str.split()
juntaowww Mar 17, 2026
66fbb33
Add missing regression: existing repo + init_submodules=True
juntaowww Mar 17, 2026
59166d6
Export container-runtime env vars before Megatron-Bridge launcher
juntaowww Mar 19, 2026
f5681b8
Add supports_gpu_directives check
juntaowww Mar 19, 2026
15081a6
Fix tests
juntaowww Mar 19, 2026
3a1a996
Restructure configurations
juntaowww Mar 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ extra_container_mounts = []
[[git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "v0.3.0"
mount_as = "/opt/Megatron-Bridge"

[cmd_args]
gpu_type = "b200"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ extra_container_mounts = []
[[git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "v0.3.0"
mount_as = "/opt/Megatron-Bridge"

[cmd_args]
gpu_type = "gb200"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ extra_container_mounts = []
[[git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "v0.3.0"
mount_as = "/opt/Megatron-Bridge"

[cmd_args]
gpu_type = "gb300"
Expand All @@ -32,6 +31,7 @@ model_family_name = "qwen"
model_recipe_name = "qwen3_30b_a3b"
gpus_per_node = 4
num_gpus = 8
# mb = 4 # In case of OOM, uncomment this for a smaller micro-batch size
domain = "llm"
task = "pretrain"
compute_dtype = "fp8_mx"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ extra_container_mounts = []
[[git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "v0.3.0"
mount_as = "/opt/Megatron-Bridge"

[cmd_args]
gpu_type = "h100"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "megatron_bridge_qwen_30b"

[[Tests]]
id = "megatron_bridge_qwen_30b"
test_name = "megatron_bridge_qwen_30b"
num_nodes = "2"

[[Tests.git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "r0.3.0"
mount_as = "/opt/Megatron-Bridge"
init_submodules = true

[Tests.extra_env_vars]
PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
3 changes: 2 additions & 1 deletion src/cloudai/_core/installables.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -91,6 +91,7 @@ class GitRepo(Installable, BaseModel):

url: str
commit: str
init_submodules: bool = False
installed_path: Optional[Path] = None
mount_as: Optional[str] = None

Expand Down
3 changes: 2 additions & 1 deletion src/cloudai/_core/test_scenario.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -62,6 +62,7 @@ class TestRun:
test: TestDefinition
num_nodes: Union[int, list[int]]
nodes: List[str]
exclude_nodes: List[str] = field(default_factory=list)
output_path: Path = Path("")
iterations: int = 1
current_iteration: int = 0
Expand Down
7 changes: 7 additions & 0 deletions src/cloudai/models/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ class TestRunModel(BaseModel):
test_name: Optional[str] = None
num_nodes: int | list[int] | None = None
nodes: list[str] = Field(default_factory=list)
exclude_nodes: list[str] = Field(
default_factory=list,
description=(
"Hostnames to exclude from the resolved node list. "
"Supports Slurm range syntax, e.g. ['node-048', 'node-[101-104]']."
),
)
weight: int = 0
iterations: int = 1
sol: Optional[float] = None
Expand Down
29 changes: 28 additions & 1 deletion src/cloudai/systems/kubernetes/kubernetes_installer.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,23 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
verify_res = self._verify_commit(item.commit, repo_path)
if not verify_res.success:
return verify_res
if item.init_submodules:
res = self._init_submodules(repo_path)
if not res.success:
return res
item.installed_path = repo_path
msg = f"Git repository already exists at {repo_path}."
logging.debug(msg)
return InstallStatusResult(True, msg)

res = self._clone_and_setup_repo(item, repo_path)
if not res.success:
return res

item.installed_path = repo_path
return InstallStatusResult(True)

def _clone_and_setup_repo(self, item: GitRepo, repo_path: Path) -> InstallStatusResult:
res = self._clone_repository(item.url, repo_path)
if not res.success:
return res
Expand All @@ -172,7 +184,14 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
rmtree(repo_path)
return res

item.installed_path = repo_path
if item.init_submodules:
res = self._init_submodules(repo_path)
if not res.success:
logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
if repo_path.exists():
rmtree(repo_path)
return res

return InstallStatusResult(True)

def _install_python_executable(self, item: PythonExecutable) -> InstallStatusResult:
Expand Down Expand Up @@ -237,6 +256,14 @@ def _checkout_commit(self, commit_hash: str, path: Path) -> InstallStatusResult:
return InstallStatusResult(False, f"Failed to checkout commit {commit_hash}: {result.stderr}")
return InstallStatusResult(True)

def _init_submodules(self, path: Path) -> InstallStatusResult:
    """
    Initialize and update every git submodule, recursively, for the repository at ``path``.

    Args:
        path: Directory used as the working directory for the git command.

    Returns:
        InstallStatusResult: A successful result when git exits with code 0; otherwise a
            failed result whose message carries git's stderr output.
    """
    logging.debug(f"Initializing submodules in {path}")
    completed = subprocess.run(
        ["git", "submodule", "update", "--init", "--recursive"],
        cwd=str(path),
        capture_output=True,
        text=True,
    )
    # Only the exit status decides the outcome; stderr is surfaced solely on failure.
    if completed.returncode == 0:
        return InstallStatusResult(True)
    return InstallStatusResult(False, f"Failed to initialize submodules: {completed.stderr}")

def _verify_commit(self, ref: str, path: Path) -> InstallStatusResult:
try:
result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(path), capture_output=True, text=True)
Expand Down
10 changes: 9 additions & 1 deletion src/cloudai/systems/slurm/slurm_command_gen_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,9 @@ def _append_nodes_related_directives(self, content: List[str]) -> Optional[Path]

content.append(f"#SBATCH -N {num_nodes}")

if self.test_run.exclude_nodes:
content.append(f"#SBATCH --exclude={','.join(self.test_run.exclude_nodes)}")

return None

def _format_env_vars(self, env_vars: Dict[str, Any]) -> str:
Expand Down Expand Up @@ -465,12 +468,17 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]:
str(self.test_run.step),
str(self.test_run.nnodes),
",".join(self.test_run.nodes),
",".join(self.test_run.exclude_nodes),
]
)

if cache_key in self._node_spec_cache:
logging.debug(f"Using cached node allocation for {cache_key}: {self._node_spec_cache[cache_key]}")
return self._node_spec_cache[cache_key]

self._node_spec_cache[cache_key] = self.system.get_nodes_by_spec(self.test_run.nnodes, self.test_run.nodes)
num_nodes, node_list = self.system.get_nodes_by_spec(
self.test_run.nnodes, self.test_run.nodes, exclude_nodes=self.test_run.exclude_nodes or None
)

self._node_spec_cache[cache_key] = (num_nodes, node_list)
return self._node_spec_cache[cache_key]
29 changes: 28 additions & 1 deletion src/cloudai/systems/slurm/slurm_installer.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,23 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
verify_res = self._verify_commit(item.commit, repo_path)
if not verify_res.success:
return verify_res
if item.init_submodules:
res = self._init_submodules(repo_path)
if not res.success:
return res
item.installed_path = repo_path
msg = f"Git repository already exists at {repo_path}."
logging.debug(msg)
return InstallStatusResult(True, msg)

res = self._clone_and_setup_repo(item, repo_path)
if not res.success:
return res

item.installed_path = repo_path
return InstallStatusResult(True)

def _clone_and_setup_repo(self, item: GitRepo, repo_path: Path) -> InstallStatusResult:
res = self._clone_repository(item.url, repo_path)
if not res.success:
return res
Expand All @@ -225,7 +237,14 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
rmtree(repo_path)
return res

item.installed_path = repo_path
if item.init_submodules:
res = self._init_submodules(repo_path)
if not res.success:
logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
if repo_path.exists():
rmtree(repo_path)
return res

return InstallStatusResult(True)

def _install_python_executable(self, item: PythonExecutable) -> InstallStatusResult:
Expand Down Expand Up @@ -290,6 +309,14 @@ def _checkout_commit(self, commit_hash: str, path: Path) -> InstallStatusResult:
return InstallStatusResult(False, f"Failed to checkout commit {commit_hash}: {result.stderr}")
return InstallStatusResult(True)

def _init_submodules(self, path: Path) -> InstallStatusResult:
    """
    Initialize and update every git submodule, recursively, for the repository at ``path``.

    Args:
        path: Directory used as the working directory for the git command.

    Returns:
        InstallStatusResult: A successful result when git exits with code 0; otherwise a
            failed result whose message carries git's stderr output.
    """
    logging.debug(f"Initializing submodules in {path}")
    completed = subprocess.run(
        ["git", "submodule", "update", "--init", "--recursive"],
        cwd=str(path),
        capture_output=True,
        text=True,
    )
    # Only the exit status decides the outcome; stderr is surfaced solely on failure.
    if completed.returncode == 0:
        return InstallStatusResult(True)
    return InstallStatusResult(False, f"Failed to initialize submodules: {completed.stderr}")

def _verify_commit(self, ref: str, path: Path) -> InstallStatusResult:
try:
result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(path), capture_output=True, text=True)
Expand Down
Loading
Loading