Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions primus/tools/preflight/network/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,20 @@ class Finding:
details: Dict[str, Any]


def collect_network_info() -> List[Finding]:
"""Collect all network info (basic + standard + runtime)."""
def collect_network_info(expect_distributed: bool = True) -> List[Finding]:
"""
Run a sequence of network diagnostic checks (basic, standard, and full)
and aggregate their findings into a single list. The checks include status
of network interfaces, distributed environment detection, IP routes, and
runtime compatibility for distributed training. The `expect_distributed`
flag is forwarded to the full (runtime) check, where it decides whether an
uninitialized process group is reported as a WARN or an INFO finding.

Args:
expect_distributed (bool): Whether distributed execution is expected.

Returns:
List[Finding]: All findings from the three network checks.
"""
out: List[Finding] = []

nb = run_network_basic_checks()
Expand All @@ -37,7 +49,7 @@ def collect_network_info() -> List[Finding]:
for f in ns["findings"]:
out.append(Finding(level=f.level, message=f.message, details=f.details))

nf = run_network_full_checks()
nf = run_network_full_checks(expect_distributed=expect_distributed)
for f in nf["findings"]:
out.append(Finding(level=f.level, message=f.message, details=f.details))

Expand Down
10 changes: 7 additions & 3 deletions primus/tools/preflight/network/network_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .utils import Finding


def run_network_full_checks() -> Dict[str, Any]:
def run_network_full_checks(expect_distributed: bool = True) -> Dict[str, Any]:
"""
Level: full

Expand All @@ -31,11 +31,15 @@ def run_network_full_checks() -> Dict[str, Any]:
runtime["pg_backend"] = dist.get_backend()
runtime["pg_init_ok"] = True
else:
# If distributed intent is detected but PG is not initialized, treat as WARN.
# If distributed intent is detected but PG is not initialized, record a WARN
# finding when a distributed runtime is expected, otherwise record it as INFO.
if bool(probe.intent.get("is_distributed")):
runtime["pg_init_ok"] = False
runtime["pg_error"] = "Process group not initialized"
findings.append(Finding("warn", "Runtime process group not initialized", runtime))
if expect_distributed:
findings.append(Finding("warn", "Runtime process group not initialized", runtime))
else:
findings.append(Finding("info", "Runtime process group not initialized", runtime))
except Exception as e:
# torch not available or dist import failed
runtime["pg_init_ok"] = False
Expand Down
16 changes: 12 additions & 4 deletions primus/tools/preflight/preflight_perf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _status_from_counts(fail_count: int, warn_count: int) -> str:
return "OK"


def run_preflight_info(args: Any) -> int:
def run_preflight_info(args: Any, expect_distributed: bool = True) -> int:
"""
Run lightweight preflight info collection (host/gpu/network), aggregate across ranks,
and write Markdown/PDF report on rank0.
Comment thread
alexsu52 marked this conversation as resolved.
Expand All @@ -62,6 +62,14 @@ def run_preflight_info(args: Any) -> int:
- dump_path (str)
- report_file_name (str)
- save_pdf (bool)
expect_distributed: Whether the run is expected to be in a distributed
(multi-rank) context. When True (default), the network portion of
preflight assumes multiple ranks may participate and will emit
warnings if it detects conditions that look like a misconfigured
or partially initialized distributed environment (e.g. distributed
intent is detected but the process group is not initialized). When
False, the run is treated as local-only: such findings are still
reported, but at INFO level instead of WARN, which is appropriate
for single-node or non-distributed preflight invocations.

Return codes:
0: success (WARN does not change rc)
Expand All @@ -88,7 +96,7 @@ def run_preflight_info(args: Any) -> int:
for f in collect_gpu_info():
findings.append(Finding(level=f.level, message=f.message, details=f.details))
if check_network:
for f in collect_network_info():
for f in collect_network_info(expect_distributed=expect_distributed):
findings.append(Finding(level=f.level, message=f.message, details=f.details))

fail_count = sum(1 for x in findings if x.level == "fail")
Expand Down Expand Up @@ -209,7 +217,7 @@ def _append_dist_init_failure(markdown_file: str, timeout_sec: int, err: Excepti
# 1) Info-only mode: run without distributed init.
if not perf_test and any_selection:
# First, emit a local-only report immediately (so user gets output even if PG init hangs).
local_rc = run_preflight_info(args)
local_rc = run_preflight_info(args, expect_distributed=False)

# Then attempt to initialize distributed with a timeout, and if successful, re-run info
# to produce an aggregated multi-node report.
Expand Down Expand Up @@ -242,7 +250,7 @@ def _append_dist_init_failure(markdown_file: str, timeout_sec: int, err: Excepti
# 2) Plain `preflight` (no flags): run info FIRST (no dist init) so we always get a report.
info_rc = 0
if not perf_test and not any_selection:
info_rc = run_preflight_info(args)
info_rc = run_preflight_info(args, expect_distributed=False)

# 3) Perf tests (perf-only OR plain preflight after info): now attempt distributed init
# with a timeout so we fail fast instead of hanging.
Expand Down