From a25c0e76f267e09a9bfa2c2365539cd62d0f6f58 Mon Sep 17 00:00:00 2001 From: aoli26 Date: Tue, 21 Apr 2026 09:07:07 +0000 Subject: [PATCH 1/3] fix cluster launch by using cmake probe --- lib/Runtime/ROCm/CMakeLists.txt | 21 ++++ lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp | 130 +++++++++++++------- 2 files changed, 108 insertions(+), 43 deletions(-) diff --git a/lib/Runtime/ROCm/CMakeLists.txt b/lib/Runtime/ROCm/CMakeLists.txt index b13e1fa56..44cfb2800 100644 --- a/lib/Runtime/ROCm/CMakeLists.txt +++ b/lib/Runtime/ROCm/CMakeLists.txt @@ -7,6 +7,25 @@ file(GLOB _rocm_paths LIST_DIRECTORIES true "/opt/rocm*") list(SORT _rocm_paths ORDER DESCENDING) find_package(hip REQUIRED CONFIG PATHS ${_rocm_paths}) +include(CheckCXXSourceCompiles) +set(_FLY_SAVED_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}") +set(_FLY_SAVED_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") +set(_FLY_SAVED_CMAKE_TRY_COMPILE_TARGET_TYPE "${CMAKE_TRY_COMPILE_TARGET_TYPE}") +set(CMAKE_REQUIRED_INCLUDES "${hip_INCLUDE_DIRS};${HIP_INCLUDE_DIRS}") +set(CMAKE_REQUIRED_DEFINITIONS "-D__HIP_PLATFORM_AMD__") +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +check_cxx_source_compiles(" + #include + int main() { + auto x = hipLaunchAttributeClusterDimension; + (void)x; + return 0; + } +" FLY_HIP_HAS_CLUSTER_ATTR) +set(CMAKE_REQUIRED_INCLUDES "${_FLY_SAVED_CMAKE_REQUIRED_INCLUDES}") +set(CMAKE_REQUIRED_DEFINITIONS "${_FLY_SAVED_CMAKE_REQUIRED_DEFINITIONS}") +set(CMAKE_TRY_COMPILE_TARGET_TYPE "${_FLY_SAVED_CMAKE_TRY_COMPILE_TARGET_TYPE}") + add_library(FlyJitRuntime SHARED FlyRocmRuntimeWrappers.cpp) target_include_directories(FlyJitRuntime PRIVATE ${LLVM_INCLUDE_DIRS} @@ -14,4 +33,6 @@ target_include_directories(FlyJitRuntime PRIVATE ) target_compile_features(FlyJitRuntime PRIVATE cxx_std_17) target_link_libraries(FlyJitRuntime PRIVATE hip::host hip::amdhip64) +target_compile_definitions(FlyJitRuntime PRIVATE + FLY_HIP_HAS_CLUSTER_ATTR=$) set_target_properties(FlyJitRuntime PROPERTIES OUTPUT_NAME "fly_jit_runtime") diff --git a/lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp b/lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp index 87b35d20d..2416beb97 100644 --- a/lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp +++ b/lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include #include @@ -66,6 +67,38 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, stream, params, extra)); } +// Cached per-device cluster-launch capability for the current HIP device. +// Cache slot encoding: 0 = unqueried, 1 = no cluster, 2 = cluster supported. +// Populated lazily from hipDeviceProp_t::clusterLaunch (field stable since +// ROCm 6.0). Per-device caching is required because mgpuSetDefaultDevice +// allows multi-GPU usage. Concurrent first-time queries on the same device +// race benignly: both threads compute the same value before storing. +static int flyDeviceClusterLaunchCap() { + int dev = 0; + if (hipGetDevice(&dev) != hipSuccess) + return 0; + + constexpr int kMaxCachedDevices = 16; + static std::atomic sCache[kMaxCachedDevices]; + + auto query = [](int d) -> int { + hipDeviceProp_t prop{}; + if (hipGetDeviceProperties(&prop, d) != hipSuccess) + return 0; + return prop.clusterLaunch ? 1 : 0; + }; + + if (dev < 0 || dev >= kMaxCachedDevices) + return query(dev); + + int cached = sCache[dev].load(std::memory_order_relaxed); + if (cached != 0) + return cached - 1; + int v = query(dev); + sCache[dev].store(v + 1, std::memory_order_relaxed); + return v; +} + extern "C" void mgpuLaunchClusterKernel(hipFunction_t function, intptr_t clusterX, intptr_t clusterY, intptr_t clusterZ, @@ -75,64 +108,75 @@ extern "C" void mgpuLaunchClusterKernel(hipFunction_t function, intptr_t blockZ, int32_t smem, hipStream_t stream, void **params, void **extra, size_t /*paramsCount*/) { -#ifdef hipLaunchAttributeClusterDimension - hipLaunchAttribute attrs[1]; - attrs[0].id = hipLaunchAttributeClusterDimension; - attrs[0].value.clusterDim.x = static_cast(clusterX); - attrs[0].value.clusterDim.y = static_cast(clusterY); - attrs[0].value.clusterDim.z = static_cast(clusterZ); - - HIP_LAUNCH_CONFIG config{}; - config.gridDimX = static_cast(gridX); - config.gridDimY = static_cast(gridY); - config.gridDimZ = static_cast(gridZ); - config.blockDimX = static_cast(blockX); - config.blockDimY = static_cast(blockY); - config.blockDimZ = static_cast(blockZ); - config.sharedMemBytes = static_cast(smem); - config.hStream = stream; - config.attrs = attrs; - config.numAttrs = 1; - - hipError_t err = hipDrvLaunchKernelEx(&config, function, params, extra); - if (err == hipSuccess) - return; - const bool requestedRealCluster = (clusterX > 1) || (clusterY > 1) || (clusterZ > 1); - if (requestedRealCluster) { + +#if FLY_HIP_HAS_CLUSTER_ATTR + const int deviceClusterCap = flyDeviceClusterLaunchCap(); + + if (requestedRealCluster && !deviceClusterCap) { fprintf(stderr, - "[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) " - "for requested cluster=(%ld,%ld,%ld); not falling back to " - "hipModuleLaunchKernel.\n", - static_cast(err), static_cast(clusterX), - static_cast(clusterY), static_cast(clusterZ)); - HIP_REPORT_IF_ERROR(err); + "[mgpuLaunchClusterKernel] cluster=(%ld,%ld,%ld) requested but " + "device reports clusterLaunch=0; aborting (no silent fallback).\n", + static_cast(clusterX), static_cast(clusterY), + static_cast(clusterZ)); return; } - fprintf(stderr, - "[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) " - "for cluster=(1,1,1); falling back to hipModuleLaunchKernel.\n", - static_cast(err)); - HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, - blockX, blockY, blockZ, smem, - stream, params, extra)); + if (deviceClusterCap) { + hipLaunchAttribute attrs[1]; + attrs[0].id = hipLaunchAttributeClusterDimension; + attrs[0].value.clusterDim.x = static_cast(clusterX); + attrs[0].value.clusterDim.y = static_cast(clusterY); + attrs[0].value.clusterDim.z = static_cast(clusterZ); + + HIP_LAUNCH_CONFIG config{}; + config.gridDimX = static_cast(gridX); + config.gridDimY = static_cast(gridY); + config.gridDimZ = static_cast(gridZ); + config.blockDimX = static_cast(blockX); + config.blockDimY = static_cast(blockY); + config.blockDimZ = static_cast(blockZ); + config.sharedMemBytes = static_cast(smem); + config.hStream = stream; + config.attrs = attrs; + config.numAttrs = 1; + + hipError_t err = hipDrvLaunchKernelEx(&config, function, params, extra); + if (err == hipSuccess) + return; + + if (requestedRealCluster) { + fprintf(stderr, + "[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) " + "for requested cluster=(%ld,%ld,%ld); not falling back.\n", + static_cast(err), static_cast(clusterX), + static_cast(clusterY), static_cast(clusterZ)); + HIP_REPORT_IF_ERROR(err); + return; + } + + fprintf(stderr, + "[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) " + "for cluster=(1,1,1); falling back to hipModuleLaunchKernel.\n", + static_cast(err)); + } #else - // Cluster launch not supported by this HIP version; ignore cluster dims - // and fall back to regular kernel launch. - if ((clusterX > 1) || (clusterY > 1) || (clusterZ > 1)) { + if (requestedRealCluster) { fprintf(stderr, "[mgpuLaunchClusterKernel] cluster=(%ld,%ld,%ld) requested but " - "hipLaunchAttributeClusterDimension is not available in this HIP " - "version; falling back to hipModuleLaunchKernel.\n", + "FlyDSL was built against a HIP without " + "hipLaunchAttributeClusterDimension; aborting " + "(no silent fallback).\n", static_cast(clusterX), static_cast(clusterY), static_cast(clusterZ)); + return; } +#endif + HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)); -#endif } extern "C" hipStream_t mgpuStreamCreate() { From 6f3a337f9e1c2b66c3be3dd29be631850625dff5 Mon Sep 17 00:00:00 2001 From: aoli26 Date: Fri, 15 May 2026 11:53:29 +0000 Subject: [PATCH 2/3] remove force effective_waves_per_eu for cluster --- kernels/gemm_fp8fp4_gfx1250.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernels/gemm_fp8fp4_gfx1250.py b/kernels/gemm_fp8fp4_gfx1250.py index 7dd59d106..fe7902907 100644 --- a/kernels/gemm_fp8fp4_gfx1250.py +++ b/kernels/gemm_fp8fp4_gfx1250.py @@ -96,8 +96,6 @@ def compile_mxscale_gemm( if cluster_m * cluster_n > 16: raise ValueError(f"cluster_m * cluster_n must be <= 16, got {cluster_m}*{cluster_n}") effective_waves_per_eu = waves_per_eu - if use_cluster and effective_waves_per_eu is None: - effective_waves_per_eu = 2 num_warps = m_warp * n_warp block_threads = num_warps * WAVE_SIZE From 03a632541076ce25bad7fb03fcea13ccf05d31ef Mon Sep 17 00:00:00 2001 From: aoli26 Date: Thu, 14 May 2026 06:31:35 +0000 Subject: [PATCH 3/3] Remove COMGR import preload shim --- python/flydsl/__init__.py | 5 --- python/flydsl/_compat.py | 45 ----------------------- tests/kernels/test_gemm_fp8fp4_gfx1250.py | 3 -- 3 files changed, 53 deletions(-) delete mode 100644 python/flydsl/_compat.py diff --git a/python/flydsl/__init__.py b/python/flydsl/__init__.py index 671d173dd..8ab40b56b 100644 --- a/python/flydsl/__init__.py +++ b/python/flydsl/__init__.py @@ -4,9 +4,4 @@ __version__ = "0.1.7" -# FFM simulator compatibility shim (no-op outside simulator sessions). -from ._compat import _maybe_preload_system_comgr # noqa: E402 - -_maybe_preload_system_comgr() - from .autotune import Config as Config, autotune as autotune # noqa: E402 diff --git a/python/flydsl/_compat.py b/python/flydsl/_compat.py deleted file mode 100644 index 1f4ac7500..000000000 --- a/python/flydsl/_compat.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright (c) 2025 FlyDSL Project Contributors - -"""Runtime compatibility shims loaded at import time. - -Kept separate from ``__init__.py`` so the workaround logic is isolated and -easy to find / disable. -""" - -import ctypes -import os - - -def _maybe_preload_system_comgr() -> None: - """Pre-load system ``libamd_comgr`` to avoid duplicate-option LLVM errors. - - The FFM simulator ships its own ``libamd_comgr`` that registers the same - LLVM command-line options as the system copy. If both are loaded the - process aborts with *"Option 'greedy' already exists!"*. Loading the - system copy first (with ``RTLD_GLOBAL``) makes the simulator copy a - harmless no-op. - - This function is a no-op outside FFM simulator sessions. - """ - disable = os.environ.get("FLYDSL_DISABLE_COMGR_PRELOAD", "").strip().lower() - if disable in {"1", "true", "yes", "on"}: - return - - model_path = os.environ.get("GFX1250_MODEL_PATH", "") - hsa_model_lib = os.environ.get("HSA_MODEL_LIB", "") - in_ffm_session = ("ffm-lite" in hsa_model_lib) or ("ffmlite" in model_path) - if not in_ffm_session: - return - - system_comgr = os.environ.get("FLYDSL_COMGR_PRELOAD_PATH", "/opt/rocm/lib/libamd_comgr.so.3") - sim_comgr = os.path.join(model_path, "rocm", "libamd_comgr.so.3") - if not (os.path.exists(system_comgr) and os.path.exists(sim_comgr)): - return - - mode = getattr(os, "RTLD_NOW", 0) | getattr(os, "RTLD_GLOBAL", 0) - try: - ctypes.CDLL(system_comgr, mode=mode) - except OSError: - # Keep import robust if the host ROCm stack differs. - pass diff --git a/tests/kernels/test_gemm_fp8fp4_gfx1250.py b/tests/kernels/test_gemm_fp8fp4_gfx1250.py index 70cd4d151..00677d243 100644 --- a/tests/kernels/test_gemm_fp8fp4_gfx1250.py +++ b/tests/kernels/test_gemm_fp8fp4_gfx1250.py @@ -15,12 +15,9 @@ if _PYFLIR_SRC not in sys.path: sys.path.insert(0, _PYFLIR_SRC) -# workaround for simulator import pytest # noqa: E402 import torch # noqa: E402 -import flydsl # noqa: E402,F401 -- preload system comgr before torch/HIP loads LLVM - pytestmark = [pytest.mark.l2_device, pytest.mark.rocm_lower] from flydsl.runtime.device import get_rocm_arch # noqa: E402