AMD-AGI · Xiaoming-AMD · Mar 31, 2026 · Feb 20, 2026 · Mar 16, 2026 · Mar 16, 2026
@@ -25,8 +25,20 @@ class Finding:
     details: Dict[str, Any]
 
 
-def collect_network_info() -> List[Finding]:
-    """Collect all network info (basic + standard + runtime)."""
+def collect_network_info(expect_distributed: bool = True) -> List[Finding]:
+    """
+    Run a sequence of network diagnostic checks (basic, standard, and full)
+    and aggregate their findings into a single list. The checks include status
+    of network interfaces, distributed environment detection, IP routes, and
+    runtime compatibility for distributed training. The `expect_distributed`
+    flag is forwarded to the full-level checks (`run_network_full_checks`).
+
+    Args:
+        expect_distributed (bool): Whether distributed execution is expected.
+
+    Returns:
+        List[Finding]: All findings from the three network checks.
+    """
     out: List[Finding] = []
 
     nb = run_network_basic_checks()
@@ -37,7 +49,7 @@ def collect_network_info() -> List[Finding]:
     for f in ns["findings"]:
         out.append(Finding(level=f.level, message=f.message, details=f.details))
 
-    nf = run_network_full_checks()
+    nf = run_network_full_checks(expect_distributed=expect_distributed)
     for f in nf["findings"]:
         out.append(Finding(level=f.level, message=f.message, details=f.details))
 

@@ -12,7 +12,7 @@
 from .utils import Finding
 
 
-def run_network_full_checks() -> Dict[str, Any]:
+def run_network_full_checks(expect_distributed: bool = True) -> Dict[str, Any]:
     """
     Level: full
 
@@ -31,11 +31,15 @@ def run_network_full_checks() -> Dict[str, Any]:
             runtime["pg_backend"] = dist.get_backend()
             runtime["pg_init_ok"] = True
         else:
-            # If distributed intent is detected but PG is not initialized, treat as WARN.
+            # If distributed intent is detected but PG is not initialized, record a WARN
+            # finding when a distributed runtime is expected, otherwise record it as INFO.
             if bool(probe.intent.get("is_distributed")):
                 runtime["pg_init_ok"] = False
                 runtime["pg_error"] = "Process group not initialized"
-                findings.append(Finding("warn", "Runtime process group not initialized", runtime))
+                if expect_distributed:
+                    findings.append(Finding("warn", "Runtime process group not initialized", runtime))
+                else:
+                    findings.append(Finding("info", "Runtime process group not initialized", runtime))
     except Exception as e:
         # torch not available or dist import failed
         runtime["pg_init_ok"] = False

@@ -49,7 +49,7 @@ def _status_from_counts(fail_count: int, warn_count: int) -> str:
     return "OK"
 
 
-def run_preflight_info(args: Any) -> int:
+def run_preflight_info(args: Any, expect_distributed: bool = True) -> int:
     """
     Run lightweight preflight info collection (host/gpu/network), aggregate across ranks,
     and write Markdown/PDF report on rank0.
@@ -62,6 +62,14 @@ def run_preflight_info(args: Any) -> int:
             - dump_path (str)
             - report_file_name (str)
             - save_pdf (bool)
+        expect_distributed: Whether the run is expected to be in a distributed
+            (multi-rank) context. When True (default), the network portion of
+            preflight assumes multiple ranks may participate and will emit
+            warnings if it detects conditions that look like a misconfigured
+            or partially initialized distributed environment. When False, the
+            run is treated as local-only: distributed-related network findings
+            are still reported, but as INFO instead of WARN, which is appropriate
+            for single-node or non-distributed preflight invocations.
 
     Return codes:
       0: success (WARN does not change rc)
@@ -88,7 +96,7 @@ def run_preflight_info(args: Any) -> int:
         for f in collect_gpu_info():
             findings.append(Finding(level=f.level, message=f.message, details=f.details))
     if check_network:
-        for f in collect_network_info():
+        for f in collect_network_info(expect_distributed=expect_distributed):
             findings.append(Finding(level=f.level, message=f.message, details=f.details))
 
     fail_count = sum(1 for x in findings if x.level == "fail")
@@ -209,7 +217,7 @@ def _append_dist_init_failure(markdown_file: str, timeout_sec: int, err: Excepti
     # 1) Info-only mode: run without distributed init.
     if not perf_test and any_selection:
         # First, emit a local-only report immediately (so user gets output even if PG init hangs).
-        local_rc = run_preflight_info(args)
+        local_rc = run_preflight_info(args, expect_distributed=False)
 
         # Then attempt to initialize distributed with a timeout, and if successful, re-run info
         # to produce an aggregated multi-node report.
@@ -242,7 +250,7 @@ def _append_dist_init_failure(markdown_file: str, timeout_sec: int, err: Excepti
     # 2) Plain `preflight` (no flags): run info FIRST (no dist init) so we always get a report.
     info_rc = 0
     if not perf_test and not any_selection:
-        info_rc = run_preflight_info(args)
+        info_rc = run_preflight_info(args, expect_distributed=False)
 
     # 3) Perf tests (perf-only OR plain preflight after info): now attempt distributed init
     # with a timeout so we fail fast instead of hanging.