diff --git a/csrc/include/custom_all_reduce.cuh b/csrc/include/custom_all_reduce.cuh index be7b4c1eb3..d9b4cb8845 100644 --- a/csrc/include/custom_all_reduce.cuh +++ b/csrc/include/custom_all_reduce.cuh @@ -144,9 +144,9 @@ DINLINE O downcast(V val) } // This function is meant to be used as the first synchronization in the all -// reduce kernel. Thus, it doesn't need to make any visibility guarantees for -// prior memory accesses. Note: volatile writes will not be reordered against -// other volatile writes. +// reduce kernel. When using registered CUDA graph inputs, prior memory accesses +// from producer kernels must be system-visible before peers read the input +// buffers. Issue __threadfence_system() to ensure global visibility. template DINLINE void start_sync(const RankSignals& sg, #ifndef USE_ROCM @@ -156,6 +156,11 @@ DINLINE void start_sync(const RankSignals& sg, int rank) { #ifdef USE_ROCM + // Ensure prior memory writes (e.g. from producer kernels in CUDA graph) + // are visible to peer GPUs before signaling readiness. + if(threadIdx.x == 0) + __threadfence_system(); + __syncthreads(); uint32_t flag = self_sg->_flag[blockIdx.x] + 1; if(threadIdx.x < ngpus) { @@ -907,7 +912,7 @@ struct AbsMaxFunctor template DINLINE T shfl_xor(T var, int mask, int width = opus::get_warp_size()) { - static_assert(sizeof(T) == 4); + static_assert(sizeof(T) == 4); int self = opus::lane_id(); int index = (self & ~(width - 1)) + ((self ^ mask) & (width - 1)); return __builtin_bit_cast(T, __builtin_amdgcn_ds_bpermute(index << 2, __builtin_bit_cast(int, var))); @@ -3539,7 +3544,7 @@ void dispatchFusedQKNormAllReduce(hipStream_t stream, std::to_string(d)); } RankData* ptrs = get_buffer_RD(stream, qkv_in); - + #define DISPATCH_QKNORM_AR_FUSION_KERNEL(NGPUS) \ { \ qknorm_allreduce_fusion_kernel_2stage_launcher(ptrs, \