From a25c0e76f267e09a9bfa2c2365539cd62d0f6f58 Mon Sep 17 00:00:00 2001
From: aoli26 <Ao.Li@amd.com>
Date: Tue, 21 Apr 2026 09:07:07 +0000
Subject: [PATCH 1/3] fix cluster launch by using cmake probe

---
 lib/Runtime/ROCm/CMakeLists.txt             |  21 ++++
 lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp | 130 +++++++++++++-------
 2 files changed, 108 insertions(+), 43 deletions(-)
diff --git a/lib/Runtime/ROCm/CMakeLists.txt b/lib/Runtime/ROCm/CMakeLists.txt
index b13e1fa56..44cfb2800 100644
--- a/lib/Runtime/ROCm/CMakeLists.txt
+++ b/lib/Runtime/ROCm/CMakeLists.txt
@@ -7,6 +7,25 @@ file(GLOB _rocm_paths LIST_DIRECTORIES true "/opt/rocm*")
 list(SORT _rocm_paths ORDER DESCENDING)
 find_package(hip REQUIRED CONFIG PATHS ${_rocm_paths})
 
+include(CheckCXXSourceCompiles)  # provides check_cxx_source_compiles()
+set(_FLY_SAVED_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")  # save probe settings; restored below
+set(_FLY_SAVED_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}")
+set(_FLY_SAVED_CMAKE_TRY_COMPILE_TARGET_TYPE "${CMAKE_TRY_COMPILE_TARGET_TYPE}")
+set(CMAKE_REQUIRED_INCLUDES "${hip_INCLUDE_DIRS};${HIP_INCLUDE_DIRS}")
+set(CMAKE_REQUIRED_DEFINITIONS "-D__HIP_PLATFORM_AMD__")
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)  # compile-only check: no executable is linked
+check_cxx_source_compiles("
+  #include <hip/hip_runtime_api.h>
+  int main() {
+    auto x = hipLaunchAttributeClusterDimension;
+    (void)x;
+    return 0;
+  }
+" FLY_HIP_HAS_CLUSTER_ATTR)  # true when the HIP headers declare the enumerator
+set(CMAKE_REQUIRED_INCLUDES "${_FLY_SAVED_CMAKE_REQUIRED_INCLUDES}")  # restore the saved probe settings
+set(CMAKE_REQUIRED_DEFINITIONS "${_FLY_SAVED_CMAKE_REQUIRED_DEFINITIONS}")
+set(CMAKE_TRY_COMPILE_TARGET_TYPE "${_FLY_SAVED_CMAKE_TRY_COMPILE_TARGET_TYPE}")
+
 add_library(FlyJitRuntime SHARED FlyRocmRuntimeWrappers.cpp)
 target_include_directories(FlyJitRuntime PRIVATE
   ${LLVM_INCLUDE_DIRS}
@@ -14,4 +33,6 @@ target_include_directories(FlyJitRuntime PRIVATE
 )
 target_compile_features(FlyJitRuntime PRIVATE cxx_std_17)
 target_link_libraries(FlyJitRuntime PRIVATE hip::host hip::amdhip64)
+target_compile_definitions(FlyJitRuntime PRIVATE
+  FLY_HIP_HAS_CLUSTER_ATTR=$<BOOL:${FLY_HIP_HAS_CLUSTER_ATTR}>)  # always defined, as 0 or 1
 set_target_properties(FlyJitRuntime PROPERTIES OUTPUT_NAME "fly_jit_runtime")
diff --git a/lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp b/lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp
index 87b35d20d..2416beb97 100644
--- a/lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp
+++ b/lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <atomic>
 #include <cassert>
 #include <cstdio>
 #include <vector>
@@ -66,6 +67,38 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
                                             stream, params, extra));
 }
 
+// Reports whether the current HIP device supports cluster launch: returns 1
+// when hipDeviceProp_t::clusterLaunch (field stable since ROCm 6.0) is
+// non-zero, and 0 otherwise or when the device cannot be queried.
+// Results are cached per device because mgpuSetDefaultDevice allows
+// multi-GPU usage. Cache slot encoding: 0 = unqueried, 1 = no cluster,
+// 2 = cluster supported. Failed property queries are never cached, so a
+// transient error is retried on the next call. Concurrent first-time queries
+// on the same device race benignly: both threads store the same value.
+static int flyDeviceClusterLaunchCap() {
+  int dev = 0;
+  if (hipGetDevice(&dev) != hipSuccess)
+    return 0;
+
+  constexpr int kMaxCachedDevices = 16;
+  static std::atomic<int> sCache[kMaxCachedDevices];
+  const bool cacheable = dev >= 0 && dev < kMaxCachedDevices;
+
+  if (cacheable) {
+    const int cached = sCache[dev].load(std::memory_order_relaxed);
+    if (cached != 0)
+      return cached - 1;
+  }
+
+  hipDeviceProp_t prop{};
+  if (hipGetDeviceProperties(&prop, dev) != hipSuccess)
+    return 0; // Not cached: the query is retried on the next call.
+  const int cap = prop.clusterLaunch ? 1 : 0;
+  if (cacheable)
+    sCache[dev].store(cap + 1, std::memory_order_relaxed);
+  return cap;
+}
+
 extern "C" void mgpuLaunchClusterKernel(hipFunction_t function,
                                         intptr_t clusterX, intptr_t clusterY,
                                         intptr_t clusterZ,
@@ -75,64 +108,75 @@ extern "C" void mgpuLaunchClusterKernel(hipFunction_t function,
                                         intptr_t blockZ, int32_t smem,
                                         hipStream_t stream, void **params,
                                         void **extra, size_t /*paramsCount*/) {
-#ifdef hipLaunchAttributeClusterDimension
-  hipLaunchAttribute attrs[1];
-  attrs[0].id = hipLaunchAttributeClusterDimension;
-  attrs[0].value.clusterDim.x = static_cast<unsigned>(clusterX);
-  attrs[0].value.clusterDim.y = static_cast<unsigned>(clusterY);
-  attrs[0].value.clusterDim.z = static_cast<unsigned>(clusterZ);
-
-  HIP_LAUNCH_CONFIG config{};
-  config.gridDimX = static_cast<unsigned>(gridX);
-  config.gridDimY = static_cast<unsigned>(gridY);
-  config.gridDimZ = static_cast<unsigned>(gridZ);
-  config.blockDimX = static_cast<unsigned>(blockX);
-  config.blockDimY = static_cast<unsigned>(blockY);
-  config.blockDimZ = static_cast<unsigned>(blockZ);
-  config.sharedMemBytes = static_cast<unsigned>(smem);
-  config.hStream = stream;
-  config.attrs = attrs;
-  config.numAttrs = 1;
-
-  hipError_t err = hipDrvLaunchKernelEx(&config, function, params, extra);
-  if (err == hipSuccess)
-    return;
-
   const bool requestedRealCluster =
       (clusterX > 1) || (clusterY > 1) || (clusterZ > 1);
-  if (requestedRealCluster) {
+  // FLY_HIP_HAS_CLUSTER_ATTR is 0 or 1, decided by the CMake header probe.
+#if FLY_HIP_HAS_CLUSTER_ATTR
+  const int deviceClusterCap = flyDeviceClusterLaunchCap();
+  // Real cluster requested on a device without support: report and skip.
+  if (requestedRealCluster && !deviceClusterCap) {
     fprintf(stderr,
-            "[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) "
-            "for requested cluster=(%ld,%ld,%ld); not falling back to "
-            "hipModuleLaunchKernel.\n",
-            static_cast<int>(err), static_cast<long>(clusterX),
-            static_cast<long>(clusterY), static_cast<long>(clusterZ));
-    HIP_REPORT_IF_ERROR(err);
+            "[mgpuLaunchClusterKernel] cluster=(%ld,%ld,%ld) requested but "
+            "device reports clusterLaunch=0; aborting (no silent fallback).\n",
+            static_cast<long>(clusterX), static_cast<long>(clusterY),
+            static_cast<long>(clusterZ));
     return;
   }
 
-  fprintf(stderr,
-          "[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) "
-          "for cluster=(1,1,1); falling back to hipModuleLaunchKernel.\n",
-          static_cast<int>(err));
-  HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
-                                            blockX, blockY, blockZ, smem,
-                                            stream, params, extra));
+  if (deviceClusterCap) {
+    hipLaunchAttribute attrs[1];
+    attrs[0].id = hipLaunchAttributeClusterDimension;
+    attrs[0].value.clusterDim.x = static_cast<unsigned>(clusterX);
+    attrs[0].value.clusterDim.y = static_cast<unsigned>(clusterY);
+    attrs[0].value.clusterDim.z = static_cast<unsigned>(clusterZ);
+    // Copy the launch geometry, shared-memory size and stream into the config.
+    HIP_LAUNCH_CONFIG config{};
+    config.gridDimX = static_cast<unsigned>(gridX);
+    config.gridDimY = static_cast<unsigned>(gridY);
+    config.gridDimZ = static_cast<unsigned>(gridZ);
+    config.blockDimX = static_cast<unsigned>(blockX);
+    config.blockDimY = static_cast<unsigned>(blockY);
+    config.blockDimZ = static_cast<unsigned>(blockZ);
+    config.sharedMemBytes = static_cast<unsigned>(smem);
+    config.hStream = stream;
+    config.attrs = attrs;
+    config.numAttrs = 1;
+    // Launch with the cluster-dimension attribute attached to the config.
+    hipError_t err = hipDrvLaunchKernelEx(&config, function, params, extra);
+    if (err == hipSuccess)
+      return;
+    // A failed real-cluster launch is reported; no plain-launch retry.
+    if (requestedRealCluster) {
+      fprintf(stderr,
+              "[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) "
+              "for requested cluster=(%ld,%ld,%ld); not falling back.\n",
+              static_cast<int>(err), static_cast<long>(clusterX),
+              static_cast<long>(clusterY), static_cast<long>(clusterZ));
+      HIP_REPORT_IF_ERROR(err);
+      return;
+    }
+    // cluster=(1,1,1): log the failure, then fall through to the plain launch.
+    fprintf(stderr,
+            "[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) "
+            "for cluster=(1,1,1); falling back to hipModuleLaunchKernel.\n",
+            static_cast<int>(err));
+  }
 #else
-  // Cluster launch not supported by this HIP version; ignore cluster dims
-  // and fall back to regular kernel launch.
-  if ((clusterX > 1) || (clusterY > 1) || (clusterZ > 1)) {
+  if (requestedRealCluster) {
     fprintf(stderr,
             "[mgpuLaunchClusterKernel] cluster=(%ld,%ld,%ld) requested but "
-            "hipLaunchAttributeClusterDimension is not available in this HIP "
-            "version; falling back to hipModuleLaunchKernel.\n",
+            "FlyDSL was built against a HIP without "
+            "hipLaunchAttributeClusterDimension; aborting "
+            "(no silent fallback).\n",
             static_cast<long>(clusterX), static_cast<long>(clusterY),
             static_cast<long>(clusterZ));
+    return;
   }
+#endif
+  // Non-cluster launch path; reached only when no real cluster was requested.
   HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
                                             blockX, blockY, blockZ, smem,
                                             stream, params, extra));
-#endif
 }
 
 extern "C" hipStream_t mgpuStreamCreate() {

From 6f3a337f9e1c2b66c3be3dd29be631850625dff5 Mon Sep 17 00:00:00 2001
From: aoli26 <Ao.Li@amd.com>
Date: Fri, 15 May 2026 11:53:29 +0000
Subject: [PATCH 2/3] remove forced effective_waves_per_eu default for cluster

---
 kernels/gemm_fp8fp4_gfx1250.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernels/gemm_fp8fp4_gfx1250.py b/kernels/gemm_fp8fp4_gfx1250.py
index 7dd59d106..fe7902907 100644
--- a/kernels/gemm_fp8fp4_gfx1250.py
+++ b/kernels/gemm_fp8fp4_gfx1250.py
@@ -96,8 +96,6 @@ def compile_mxscale_gemm(
         if cluster_m * cluster_n > 16:
             raise ValueError(f"cluster_m * cluster_n must be <= 16, got {cluster_m}*{cluster_n}")
     effective_waves_per_eu = waves_per_eu
-    if use_cluster and effective_waves_per_eu is None:
-        effective_waves_per_eu = 2
 
     num_warps = m_warp * n_warp
     block_threads = num_warps * WAVE_SIZE

From 03a632541076ce25bad7fb03fcea13ccf05d31ef Mon Sep 17 00:00:00 2001
From: aoli26 <Ao.Li@amd.com>
Date: Thu, 14 May 2026 06:31:35 +0000
Subject: [PATCH 3/3] Remove COMGR import preload shim

---
 python/flydsl/__init__.py                 |  5 ---
 python/flydsl/_compat.py                  | 45 -----------------------
 tests/kernels/test_gemm_fp8fp4_gfx1250.py |  3 --
 3 files changed, 53 deletions(-)
 delete mode 100644 python/flydsl/_compat.py

diff --git a/python/flydsl/__init__.py b/python/flydsl/__init__.py
index 671d173dd..8ab40b56b 100644
--- a/python/flydsl/__init__.py
+++ b/python/flydsl/__init__.py
@@ -4,9 +4,4 @@
 
 __version__ = "0.1.7"
 
-# FFM simulator compatibility shim (no-op outside simulator sessions).
-from ._compat import _maybe_preload_system_comgr  # noqa: E402
-
-_maybe_preload_system_comgr()
-
 from .autotune import Config as Config, autotune as autotune  # noqa: E402
diff --git a/python/flydsl/_compat.py b/python/flydsl/_compat.py
deleted file mode 100644
index 1f4ac7500..000000000
--- a/python/flydsl/_compat.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2025 FlyDSL Project Contributors
-
-"""Runtime compatibility shims loaded at import time.
-
-Kept separate from ``__init__.py`` so the workaround logic is isolated and
-easy to find / disable.
-"""
-
-import ctypes
-import os
-
-
-def _maybe_preload_system_comgr() -> None:
-    """Pre-load system ``libamd_comgr`` to avoid duplicate-option LLVM errors.
-
-    The FFM simulator ships its own ``libamd_comgr`` that registers the same
-    LLVM command-line options as the system copy.  If both are loaded the
-    process aborts with *"Option 'greedy' already exists!"*.  Loading the
-    system copy first (with ``RTLD_GLOBAL``) makes the simulator copy a
-    harmless no-op.
-
-    This function is a no-op outside FFM simulator sessions.
-    """
-    disable = os.environ.get("FLYDSL_DISABLE_COMGR_PRELOAD", "").strip().lower()
-    if disable in {"1", "true", "yes", "on"}:
-        return
-
-    model_path = os.environ.get("GFX1250_MODEL_PATH", "")
-    hsa_model_lib = os.environ.get("HSA_MODEL_LIB", "")
-    in_ffm_session = ("ffm-lite" in hsa_model_lib) or ("ffmlite" in model_path)
-    if not in_ffm_session:
-        return
-
-    system_comgr = os.environ.get("FLYDSL_COMGR_PRELOAD_PATH", "/opt/rocm/lib/libamd_comgr.so.3")
-    sim_comgr = os.path.join(model_path, "rocm", "libamd_comgr.so.3")
-    if not (os.path.exists(system_comgr) and os.path.exists(sim_comgr)):
-        return
-
-    mode = getattr(os, "RTLD_NOW", 0) | getattr(os, "RTLD_GLOBAL", 0)
-    try:
-        ctypes.CDLL(system_comgr, mode=mode)
-    except OSError:
-        # Keep import robust if the host ROCm stack differs.
-        pass
diff --git a/tests/kernels/test_gemm_fp8fp4_gfx1250.py b/tests/kernels/test_gemm_fp8fp4_gfx1250.py
index 70cd4d151..00677d243 100644
--- a/tests/kernels/test_gemm_fp8fp4_gfx1250.py
+++ b/tests/kernels/test_gemm_fp8fp4_gfx1250.py
@@ -15,12 +15,9 @@
 if _PYFLIR_SRC not in sys.path:
     sys.path.insert(0, _PYFLIR_SRC)
 
-# workaround for simulator
 import pytest  # noqa: E402
 import torch  # noqa: E402
 
-import flydsl  # noqa: E402,F401 -- preload system comgr before torch/HIP loads LLVM
-
 pytestmark = [pytest.mark.l2_device, pytest.mark.rocm_lower]
 
 from flydsl.runtime.device import get_rocm_arch  # noqa: E402