Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions kernels/gemm_fp8fp4_gfx1250.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,6 @@ def compile_mxscale_gemm(
if cluster_m * cluster_n > 16:
raise ValueError(f"cluster_m * cluster_n must be <= 16, got {cluster_m}*{cluster_n}")
effective_waves_per_eu = waves_per_eu
if use_cluster and effective_waves_per_eu is None:
effective_waves_per_eu = 2

num_warps = m_warp * n_warp
block_threads = num_warps * WAVE_SIZE
Expand Down
21 changes: 21 additions & 0 deletions lib/Runtime/ROCm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,32 @@ file(GLOB _rocm_paths LIST_DIRECTORIES true "/opt/rocm*")
list(SORT _rocm_paths ORDER DESCENDING)
find_package(hip REQUIRED CONFIG PATHS ${_rocm_paths})

include(CheckCXXSourceCompiles)
set(_FLY_SAVED_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")
set(_FLY_SAVED_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}")
set(_FLY_SAVED_CMAKE_TRY_COMPILE_TARGET_TYPE "${CMAKE_TRY_COMPILE_TARGET_TYPE}")
set(CMAKE_REQUIRED_INCLUDES "${hip_INCLUDE_DIRS};${HIP_INCLUDE_DIRS}")
set(CMAKE_REQUIRED_DEFINITIONS "-D__HIP_PLATFORM_AMD__")
set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
check_cxx_source_compiles("
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This inline C++ probe feels a bit brittle in CMakeLists, and it only checks one symbol while the runtime uses the full cluster-launch surface (hipLaunchAttribute, HIP_LAUNCH_CONFIG, hipDrvLaunchKernelEx, hipDeviceProp_t::clusterLaunch). Could we either gate on a known HIP/ROCm version or move this into a small standalone probe that validates the full API set?

#include <hip/hip_runtime_api.h>
int main() {
auto x = hipLaunchAttributeClusterDimension;
(void)x;
return 0;
}
" FLY_HIP_HAS_CLUSTER_ATTR)
set(CMAKE_REQUIRED_INCLUDES "${_FLY_SAVED_CMAKE_REQUIRED_INCLUDES}")
set(CMAKE_REQUIRED_DEFINITIONS "${_FLY_SAVED_CMAKE_REQUIRED_DEFINITIONS}")
set(CMAKE_TRY_COMPILE_TARGET_TYPE "${_FLY_SAVED_CMAKE_TRY_COMPILE_TARGET_TYPE}")

add_library(FlyJitRuntime SHARED FlyRocmRuntimeWrappers.cpp)
target_include_directories(FlyJitRuntime PRIVATE
${LLVM_INCLUDE_DIRS}
${MLIR_INCLUDE_DIRS}
)
target_compile_features(FlyJitRuntime PRIVATE cxx_std_17)
target_link_libraries(FlyJitRuntime PRIVATE hip::host hip::amdhip64)
target_compile_definitions(FlyJitRuntime PRIVATE
FLY_HIP_HAS_CLUSTER_ATTR=$<BOOL:${FLY_HIP_HAS_CLUSTER_ATTR}>)
set_target_properties(FlyJitRuntime PROPERTIES OUTPUT_NAME "fly_jit_runtime")
130 changes: 87 additions & 43 deletions lib/Runtime/ROCm/FlyRocmRuntimeWrappers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//

#include <atomic>
#include <cassert>
#include <cstdio>
#include <vector>
Expand Down Expand Up @@ -66,6 +67,38 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
stream, params, extra));
}

// Cached per-device cluster-launch capability for the current HIP device.
// Cache slot encoding: 0 = unqueried, 1 = no cluster, 2 = cluster supported.
// Populated lazily from hipDeviceProp_t::clusterLaunch (field stable since
// ROCm 6.0). Per-device caching is required because mgpuSetDefaultDevice
// allows multi-GPU usage. Concurrent first-time queries on the same device
// race benignly: both threads compute the same value before storing.
static int flyDeviceClusterLaunchCap() {
int dev = 0;
if (hipGetDevice(&dev) != hipSuccess)
return 0;

constexpr int kMaxCachedDevices = 16;
static std::atomic<int> sCache[kMaxCachedDevices];

auto query = [](int d) -> int {
hipDeviceProp_t prop{};
if (hipGetDeviceProperties(&prop, d) != hipSuccess)
return 0;
return prop.clusterLaunch ? 1 : 0;
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This helper is compiled unconditionally but uses hipDeviceProp_t::clusterLaunch, while the CMake probe only checks hipLaunchAttributeClusterDimension. On HIP headers where the attr check is false, this field may also be missing and break the build before the #if FLY_HIP_HAS_CLUSTER_ATTR path matters. Could we guard this helper or include clusterLaunch in the same feature probe?

};

if (dev < 0 || dev >= kMaxCachedDevices)
return query(dev);

int cached = sCache[dev].load(std::memory_order_relaxed);
if (cached != 0)
return cached - 1;
int v = query(dev);
sCache[dev].store(v + 1, std::memory_order_relaxed);
return v;
}

extern "C" void mgpuLaunchClusterKernel(hipFunction_t function,
intptr_t clusterX, intptr_t clusterY,
intptr_t clusterZ,
Expand All @@ -75,64 +108,75 @@ extern "C" void mgpuLaunchClusterKernel(hipFunction_t function,
intptr_t blockZ, int32_t smem,
hipStream_t stream, void **params,
void **extra, size_t /*paramsCount*/) {
#ifdef hipLaunchAttributeClusterDimension
hipLaunchAttribute attrs[1];
attrs[0].id = hipLaunchAttributeClusterDimension;
attrs[0].value.clusterDim.x = static_cast<unsigned>(clusterX);
attrs[0].value.clusterDim.y = static_cast<unsigned>(clusterY);
attrs[0].value.clusterDim.z = static_cast<unsigned>(clusterZ);

HIP_LAUNCH_CONFIG config{};
config.gridDimX = static_cast<unsigned>(gridX);
config.gridDimY = static_cast<unsigned>(gridY);
config.gridDimZ = static_cast<unsigned>(gridZ);
config.blockDimX = static_cast<unsigned>(blockX);
config.blockDimY = static_cast<unsigned>(blockY);
config.blockDimZ = static_cast<unsigned>(blockZ);
config.sharedMemBytes = static_cast<unsigned>(smem);
config.hStream = stream;
config.attrs = attrs;
config.numAttrs = 1;

hipError_t err = hipDrvLaunchKernelEx(&config, function, params, extra);
if (err == hipSuccess)
return;

const bool requestedRealCluster =
(clusterX > 1) || (clusterY > 1) || (clusterZ > 1);
if (requestedRealCluster) {

#if FLY_HIP_HAS_CLUSTER_ATTR
const int deviceClusterCap = flyDeviceClusterLaunchCap();

if (requestedRealCluster && !deviceClusterCap) {
fprintf(stderr,
"[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) "
"for requested cluster=(%ld,%ld,%ld); not falling back to "
"hipModuleLaunchKernel.\n",
static_cast<int>(err), static_cast<long>(clusterX),
static_cast<long>(clusterY), static_cast<long>(clusterZ));
HIP_REPORT_IF_ERROR(err);
"[mgpuLaunchClusterKernel] cluster=(%ld,%ld,%ld) requested but "
"device reports clusterLaunch=0; aborting (no silent fallback).\n",
static_cast<long>(clusterX), static_cast<long>(clusterY),
static_cast<long>(clusterZ));
return;
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This still returns normally after deciding a real cluster launch cannot be honored, so callers/tests can continue as if the kernel launched. Since the PR goal is no silent fallback, can we surface this as a real runtime failure (for example hipErrorNotSupported through a shared error path) instead of only fprintf + return?

}

fprintf(stderr,
"[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) "
"for cluster=(1,1,1); falling back to hipModuleLaunchKernel.\n",
static_cast<int>(err));
HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
blockX, blockY, blockZ, smem,
stream, params, extra));
if (deviceClusterCap) {
hipLaunchAttribute attrs[1];
attrs[0].id = hipLaunchAttributeClusterDimension;
attrs[0].value.clusterDim.x = static_cast<unsigned>(clusterX);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is a C ABI boundary, it would be safer to validate cluster dims before casting from intptr_t to unsigned. A zero/negative or overflowing value would become a confusing launch config or HIP error, and it also affects the requestedRealCluster check above.

attrs[0].value.clusterDim.y = static_cast<unsigned>(clusterY);
attrs[0].value.clusterDim.z = static_cast<unsigned>(clusterZ);

HIP_LAUNCH_CONFIG config{};
config.gridDimX = static_cast<unsigned>(gridX);
config.gridDimY = static_cast<unsigned>(gridY);
config.gridDimZ = static_cast<unsigned>(gridZ);
config.blockDimX = static_cast<unsigned>(blockX);
config.blockDimY = static_cast<unsigned>(blockY);
config.blockDimZ = static_cast<unsigned>(blockZ);
config.sharedMemBytes = static_cast<unsigned>(smem);
config.hStream = stream;
config.attrs = attrs;
config.numAttrs = 1;

hipError_t err = hipDrvLaunchKernelEx(&config, function, params, extra);
if (err == hipSuccess)
return;

if (requestedRealCluster) {
fprintf(stderr,
"[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) "
"for requested cluster=(%ld,%ld,%ld); not falling back.\n",
static_cast<int>(err), static_cast<long>(clusterX),
static_cast<long>(clusterY), static_cast<long>(clusterZ));
HIP_REPORT_IF_ERROR(err);
return;
}

fprintf(stderr,
"[mgpuLaunchClusterKernel] hipDrvLaunchKernelEx failed (err=%d) "
"for cluster=(1,1,1); falling back to hipModuleLaunchKernel.\n",
static_cast<int>(err));
}
#else
// Cluster launch not supported by this HIP version; ignore cluster dims
// and fall back to regular kernel launch.
if ((clusterX > 1) || (clusterY > 1) || (clusterZ > 1)) {
if (requestedRealCluster) {
fprintf(stderr,
"[mgpuLaunchClusterKernel] cluster=(%ld,%ld,%ld) requested but "
"hipLaunchAttributeClusterDimension is not available in this HIP "
"version; falling back to hipModuleLaunchKernel.\n",
"FlyDSL was built against a HIP without "
"hipLaunchAttributeClusterDimension; aborting "
"(no silent fallback).\n",
static_cast<long>(clusterX), static_cast<long>(clusterY),
static_cast<long>(clusterZ));
return;
}
#endif

HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
blockX, blockY, blockZ, smem,
stream, params, extra));
#endif
}

extern "C" hipStream_t mgpuStreamCreate() {
Expand Down
5 changes: 0 additions & 5 deletions python/flydsl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,4 @@

__version__ = "0.1.7"

# FFM simulator compatibility shim (no-op outside simulator sessions).
from ._compat import _maybe_preload_system_comgr # noqa: E402

_maybe_preload_system_comgr()

from .autotune import Config as Config, autotune as autotune # noqa: E402
45 changes: 0 additions & 45 deletions python/flydsl/_compat.py

This file was deleted.

3 changes: 0 additions & 3 deletions tests/kernels/test_gemm_fp8fp4_gfx1250.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,9 @@
if _PYFLIR_SRC not in sys.path:
sys.path.insert(0, _PYFLIR_SRC)

# workaround for simulator
import pytest # noqa: E402
import torch # noqa: E402

import flydsl # noqa: E402,F401 -- preload system comgr before torch/HIP loads LLVM

pytestmark = [pytest.mark.l2_device, pytest.mark.rocm_lower]

from flydsl.runtime.device import get_rocm_arch # noqa: E402
Expand Down
Loading