From dc89682138209bfffe01989f285a6f3ea812774f Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Thu, 25 Jul 2024 15:40:41 +0000 Subject: [PATCH 1/7] Add fclang-abi-compat flag to HIP_HCC_FLAGS to match upstream PyTorch. --- fbgemm_gpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 0d454ff580..4bf31497e1 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -201,6 +201,7 @@ if(USE_ROCM) list(APPEND HIP_HCC_FLAGS " \"-Wno-#pragma-messages\" " " \"-Wno-#warnings\" " + -fclang-abi-compat=17 -Wno-cuda-compat -Wno-deprecated-declarations -Wno-format From 1c455dd9899f91cea2a0de3a785b640bfed1f16e Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Thu, 25 Jul 2024 15:55:51 +0000 Subject: [PATCH 2/7] Initialize variables in support of ROCm 6.2 enablement. --- fbgemm_gpu/codegen/genscript/optimizers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fbgemm_gpu/codegen/genscript/optimizers.py b/fbgemm_gpu/codegen/genscript/optimizers.py index b056c3c1ba..bd758fe76c 100644 --- a/fbgemm_gpu/codegen/genscript/optimizers.py +++ b/fbgemm_gpu/codegen/genscript/optimizers.py @@ -175,6 +175,8 @@ def rowwise_adagrad() -> Dict[str, Any]: at::acc_type multiplier; at::acc_type correction; + multiplier = 0.0; + correction = 0.0; if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; momentum1[idx] = new_sum_square_grads; @@ -485,6 +487,8 @@ def rowwise_adagrad_with_counter() -> Dict[str, Any]: at::acc_type adjusted_multiplier; at::acc_type exp_reg_correction; + adjusted_multiplier = 0.0; + exp_reg_correction = 0.0; if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; @@ -852,6 +856,7 @@ def partial_rowwise_lamb() -> Dict[str, Any]: warpReduceAllSum, kThreadGroupSize>(g_local_sum_square, shfl_sync_mask) / D; at::acc_type m2; + m2 = 0.0; if (threadIdx.x == 0) { m2 = beta2 * momentum2[idx] + (1.0 - beta2) * g_avg_square; momentum2[idx] = m2; @@ -998,6 +1003,7 @@ def partial_rowwise_adam() -> Dict[str, Any]: warpReduceAllSum, kThreadGroupSize>(g_local_sum_square) / D; at::acc_type v_hat_t; + v_hat_t = 0.0; if (threadIdx.x == 0) { at::acc_type v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2); momentum2[idx] = v_t; From 5e5a7843cbab828f78592900dfe31fc5e338e885 Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Thu, 25 Jul 2024 16:09:56 +0000 Subject: [PATCH 3/7] More variables that require initialization in support of ROCm 6.2 enablement. --- fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 46fd14c9e5..02d35af313 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -1102,6 +1102,8 @@ DEVICE_INLINE T warp_reduce_max(T val) { template DEVICE_INLINE float2 warp_find_qparams(scalar_t local_min, scalar_t local_max) { float2 qparams; + qparams.x = 0.0f; + qparams.y = 0.0f; local_min = warp_reduce_min(local_min); local_max = warp_reduce_max(local_max); if (threadIdx.x == 0) { From e431139138233a39cd602d1184aa3154d19931bf Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Thu, 25 Jul 2024 19:23:08 +0000 Subject: [PATCH 4/7] Additional variables that require initialization in support of ROCm 6.2 enablement. --- fbgemm_gpu/codegen/embedding_common_code_generator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fbgemm_gpu/codegen/embedding_common_code_generator.py b/fbgemm_gpu/codegen/embedding_common_code_generator.py index c81b680a26..e1d6062794 100644 --- a/fbgemm_gpu/codegen/embedding_common_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_common_code_generator.py @@ -1096,6 +1096,8 @@ def rowwise_adagrad_with_counter() -> Dict[str, Any]: at::acc_type adjusted_multiplier; at::acc_type exp_reg_correction; + adjusted_multiplier = 0.0; + exp_reg_correction = 0.0; if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; @@ -1463,6 +1465,7 @@ def partial_rowwise_lamb() -> Dict[str, Any]: warpReduceAllSum, kThreadGroupSize>(g_local_sum_square, shfl_sync_mask) / D; at::acc_type m2; + m2 = 0.0; if (threadIdx.x == 0) { m2 = beta2 * momentum2[idx] + (1.0 - beta2) * g_avg_square; momentum2[idx] = m2; @@ -1609,6 +1612,7 @@ def partial_rowwise_adam() -> Dict[str, Any]: warpReduceAllSum, kThreadGroupSize>(g_local_sum_square) / D; at::acc_type v_hat_t; + v_hat_t = 0.0; if (threadIdx.x == 0) { at::acc_type v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2); momentum2[idx] = v_t; From ad836874584e8a55aa8d34306ca6918209c3c7b1 Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Fri, 17 May 2024 15:45:04 +0000 Subject: [PATCH 5/7] Switch hipcub::DeviceRadixSort::SortPairs call to rocprim::device_radix_sort_pairs --- .../transpose_embedding_input.cu | 39 ++++++++++++++++++- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu index fe126453fc..c0bbf2492b 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu @@ -9,7 +9,7 @@ #include "fbgemm_gpu/embedding_backward_template_helpers.cuh" // @manual #include "fbgemm_gpu/ops_utils.h" // @manual #include "fbgemm_gpu/split_embeddings_utils.cuh" // @manual - +#include // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" // @manual #include @@ -297,6 +297,7 @@ transpose_embedding_input( } { size_t temp_storage_bytes = 0; +#ifdef __HIP_PLATFORM_NVIDIA__ AT_CUDA_CHECK( FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( nullptr, @@ -326,7 +327,41 @@ transpose_embedding_input( total_hash_size_bits, at::cuda::getCurrentCUDAStream(), false)); - } +#else + using config = rocprim::radix_sort_config< + rocprim::default_config, + rocprim::default_config, + rocprim::default_config, + 400000>; + rocprim::radix_sort_pairs( + nullptr, + temp_storage_bytes, + linear_indices.data_ptr(), + linear_indices_sorted.data_ptr(), + infos.data_ptr(), + infos_sorted.data_ptr(), + linear_indices.numel(), + 0, + total_hash_size_bits, + at::cuda::getCurrentCUDAStream(), + false); + auto temp_storage = at::empty( + {static_cast(temp_storage_bytes)}, + indices.options().dtype(at::kByte)); + rocprim::radix_sort_pairs( + temp_storage.data_ptr(), + temp_storage_bytes, + linear_indices.data_ptr(), + linear_indices_sorted.data_ptr(), + infos.data_ptr(), + infos_sorted.data_ptr(), + linear_indices.numel(), + 0, + total_hash_size_bits, + at::cuda::getCurrentCUDAStream(), + false); +#endif + } if (total_unique_indices != -1) { TORCH_CHECK(total_unique_indices >= 0); sorted_linear_indices_run = From cf45304240254abd98482e472e943454e8a94527 Mon Sep 17 00:00:00 2001 From: Andrey Bokovoy Date: Tue, 13 Aug 2024 16:27:45 +0200 Subject: [PATCH 6/7] Wrap rocmprim header with #ifdef --- .../src/split_embeddings_utils/transpose_embedding_input.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu index c0bbf2492b..25f9d0688d 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu @@ -9,7 +9,9 @@ #include "fbgemm_gpu/embedding_backward_template_helpers.cuh" // @manual #include "fbgemm_gpu/ops_utils.h" // @manual #include "fbgemm_gpu/split_embeddings_utils.cuh" // @manual +#ifdef USE_ROCM #include +#endif // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" // @manual #include @@ -297,7 +299,7 @@ transpose_embedding_input( } { size_t temp_storage_bytes = 0; -#ifdef __HIP_PLATFORM_NVIDIA__ +#ifndef USE_ROCM AT_CUDA_CHECK( FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( nullptr, From f39390077203af572c2b6658b49052c443802303 Mon Sep 17 00:00:00 2001 From: huizzhan Date: Wed, 17 Sep 2025 01:28:39 +0000 Subject: [PATCH 7/7] Remove fwd and warmup from benchmark profiling --- ...plit_table_batched_embeddings_benchmark.py | 44 +++++++++---------- fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py | 10 +++++ 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py index 4be370b5a8..2bcfdf4144 100644 --- a/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py @@ -1158,28 +1158,28 @@ def device_with_spec( # noqa C901 f"Accessed weights per batch: {B * sum_DLs * param_size_multiplier / 1.0e9: .2f} GB" ) - # forward - time_per_iter = benchmark_requests( - requests, - lambda indices, offsets, per_sample_weights: emb.forward( - indices, - offsets, - per_sample_weights, - feature_requires_grad=feature_requires_grad, - ), - flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, - num_warmups=warmup_runs, - ) - logging.info( - f"Forward, B: {B}, " - f"Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, W: {weighted}, " - f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950 - f"T: {time_per_iter * 1.0e6:.0f}us" - ) - - if output_dtype == SparseType.INT8: - # backward bench not representative - return + # # forward + # time_per_iter = benchmark_requests( + # requests, + # lambda indices, offsets, per_sample_weights: emb.forward( + # indices, + # offsets, + # per_sample_weights, + # feature_requires_grad=feature_requires_grad, + # ), + # flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + # num_warmups=warmup_runs, + # ) + # logging.info( + # f"Forward, B: {B}, " + # f"Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, W: {weighted}, " + # f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950 + # f"T: {time_per_iter * 1.0e6:.0f}us" + # ) + + # if output_dtype == SparseType.INT8: + # # backward bench not representative + # return if do_pooling: grad_output = torch.randn(B, sum(Ds)).to(get_device()) diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py index 00bf30d230..0f51c66caa 100644 --- a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py +++ b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py @@ -12,6 +12,9 @@ import time from subprocess import Popen from typing import Callable, List, Optional, Tuple +import roctx +# from roctx.context_decorators import RoctxRange +# from roctx.context_decorators import RoctxProfiler import torch @@ -224,6 +227,8 @@ def benchmark_requests( # noqa: C901 if warmup_ms is None: num_warmups = num_warmups + 1 if num_warmups >= 0 else 1 + tid = roctx.getThreadId() + roctx.profilerPause(tid) # warm-up the GPU before profiling bench_warmup( requests[0], @@ -241,6 +246,7 @@ def benchmark_requests( # noqa: C901 if callback_after_warmup is not None: callback_after_warmup() + roctx.profilerResume(tid) num_reqs = len(requests) iters = num_reqs if iters == -1 else iters @@ -259,7 +265,11 @@ def benchmark_requests( # noqa: C901 indices, offsets, weights = req.unpack_3() if bwd_only: # Run forward before profiling if does backward only + tid = roctx.getThreadId() + roctx.profilerPause(tid) + # fwd kernel should be hidden by profiling tool out = func(indices, offsets, weights) + roctx.profilerResume(tid) start_time = time.time() if torch.cuda.is_available(): if flush_gpu_cache_size_mb: