From dc89682138209bfffe01989f285a6f3ea812774f Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Thu, 25 Jul 2024 15:40:41 +0000
Subject: [PATCH 1/7] Add fclang-abi-compat flag to HIP_HCC_FLAGS to match
 upstream PyTorch.

---
 fbgemm_gpu/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
index 0d454ff580..4bf31497e1 100644
--- a/fbgemm_gpu/CMakeLists.txt
+++ b/fbgemm_gpu/CMakeLists.txt
@@ -201,6 +201,7 @@ if(USE_ROCM)
   list(APPEND HIP_HCC_FLAGS
     " \"-Wno-#pragma-messages\" "
     " \"-Wno-#warnings\" "
+    -fclang-abi-compat=17
     -Wno-cuda-compat
     -Wno-deprecated-declarations
     -Wno-format

From 1c455dd9899f91cea2a0de3a785b640bfed1f16e Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Thu, 25 Jul 2024 15:55:51 +0000
Subject: [PATCH 2/7] Initialize variables in support of ROCm 6.2 enablement.

---
 fbgemm_gpu/codegen/genscript/optimizers.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fbgemm_gpu/codegen/genscript/optimizers.py b/fbgemm_gpu/codegen/genscript/optimizers.py
index b056c3c1ba..bd758fe76c 100644
--- a/fbgemm_gpu/codegen/genscript/optimizers.py
+++ b/fbgemm_gpu/codegen/genscript/optimizers.py
@@ -175,6 +175,8 @@ def rowwise_adagrad() -> Dict[str, Any]:
 
     at::acc_type<cache_t, true> multiplier;
     at::acc_type<cache_t, true> correction;
+    multiplier = 0.0;
+    correction = 0.0;
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> new_sum_square_grads = momentum1[idx] + g_avg_square;
         momentum1[idx] = new_sum_square_grads;
@@ -485,6 +487,8 @@ def rowwise_adagrad_with_counter() -> Dict[str, Any]:
 
     at::acc_type<cache_t, true> adjusted_multiplier;
     at::acc_type<cache_t, true> exp_reg_correction;
+    adjusted_multiplier = 0.0;
+    exp_reg_correction = 0.0;
 
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> new_sum_square_grads = momentum1[idx] + g_avg_square;
@@ -852,6 +856,7 @@ def partial_rowwise_lamb() -> Dict[str, Any]:
         warpReduceAllSum<at::acc_type<cache_t, true>, kThreadGroupSize>(g_local_sum_square, shfl_sync_mask) / D;
 
     at::acc_type<cache_t, true> m2;
+    m2  = 0.0;
     if (threadIdx.x == 0) {
         m2 = beta2 * momentum2[idx] + (1.0 - beta2) * g_avg_square;
         momentum2[idx] = m2;
@@ -998,6 +1003,7 @@ def partial_rowwise_adam() -> Dict[str, Any]:
         warpReduceAllSum<at::acc_type<cache_t, true>, kThreadGroupSize>(g_local_sum_square) / D;
 
     at::acc_type<cache_t, true> v_hat_t;
+    v_hat_t = 0.0;
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2);
         momentum2[idx] = v_t;

From 5e5a7843cbab828f78592900dfe31fc5e338e885 Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Thu, 25 Jul 2024 16:09:56 +0000
Subject: [PATCH 3/7] More variables that require initialization in support of
 ROCm 6.2 enablement.

---
 fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
index 46fd14c9e5..02d35af313 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
+++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
@@ -1102,6 +1102,8 @@ DEVICE_INLINE T warp_reduce_max(T val) {
 template <typename scalar_t>
 DEVICE_INLINE float2 warp_find_qparams(scalar_t local_min, scalar_t local_max) {
   float2 qparams;
+  qparams.x = 0.0f;
+  qparams.y = 0.0f;
   local_min = warp_reduce_min<scalar_t>(local_min);
   local_max = warp_reduce_max<scalar_t>(local_max);
   if (threadIdx.x == 0) {

From e431139138233a39cd602d1184aa3154d19931bf Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Thu, 25 Jul 2024 19:23:08 +0000
Subject: [PATCH 4/7] Additional variables that require initialization in
 support of ROCm 6.2 enablement.

---
 fbgemm_gpu/codegen/embedding_common_code_generator.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fbgemm_gpu/codegen/embedding_common_code_generator.py b/fbgemm_gpu/codegen/embedding_common_code_generator.py
index c81b680a26..e1d6062794 100644
--- a/fbgemm_gpu/codegen/embedding_common_code_generator.py
+++ b/fbgemm_gpu/codegen/embedding_common_code_generator.py
@@ -1096,6 +1096,8 @@ def rowwise_adagrad_with_counter() -> Dict[str, Any]:
 
     at::acc_type<cache_t, true> adjusted_multiplier;
     at::acc_type<cache_t, true> exp_reg_correction;
+    adjusted_multiplier = 0.0;
+    exp_reg_correction = 0.0;
 
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> new_sum_square_grads = momentum1[idx] + g_avg_square;
@@ -1463,6 +1465,7 @@ def partial_rowwise_lamb() -> Dict[str, Any]:
         warpReduceAllSum<at::acc_type<cache_t, true>, kThreadGroupSize>(g_local_sum_square, shfl_sync_mask) / D;
 
     at::acc_type<cache_t, true> m2;
+    m2 = 0.0;
     if (threadIdx.x == 0) {
         m2 = beta2 * momentum2[idx] + (1.0 - beta2) * g_avg_square;
         momentum2[idx] = m2;
@@ -1609,6 +1612,7 @@ def partial_rowwise_adam() -> Dict[str, Any]:
         warpReduceAllSum<at::acc_type<cache_t, true>, kThreadGroupSize>(g_local_sum_square) / D;
 
     at::acc_type<cache_t, true> v_hat_t;
+    v_hat_t = 0.0;
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2);
         momentum2[idx] = v_t;

From ad836874584e8a55aa8d34306ca6918209c3c7b1 Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Fri, 17 May 2024 15:45:04 +0000
Subject: [PATCH 5/7] Switch hipcub::DeviceRadixSort::SortPairs call to
 rocprim::device_radix_sort_pairs

---
 .../transpose_embedding_input.cu              | 39 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
index fe126453fc..c0bbf2492b 100644
--- a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
+++ b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
@@ -9,7 +9,7 @@
 #include "fbgemm_gpu/embedding_backward_template_helpers.cuh" // @manual
 #include "fbgemm_gpu/ops_utils.h" // @manual
 #include "fbgemm_gpu/split_embeddings_utils.cuh" // @manual
-
+#include <rocprim/device/device_radix_sort.hpp>
 // clang-format off
 #include "fbgemm_gpu/cub_namespace_prefix.cuh" // @manual
 #include <cub/device/device_radix_sort.cuh>
@@ -297,6 +297,7 @@ transpose_embedding_input(
               }
               {
                 size_t temp_storage_bytes = 0;
+#ifdef __HIP_PLATFORM_NVIDIA__ 
                 AT_CUDA_CHECK(
                     FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs(
                         nullptr,
@@ -326,7 +327,41 @@ transpose_embedding_input(
                         total_hash_size_bits,
                         at::cuda::getCurrentCUDAStream(),
                         false));
-              }
+#else
+	        using config = rocprim::radix_sort_config<
+                rocprim::default_config,
+                rocprim::default_config,
+                rocprim::default_config,
+                400000>;		    
+	        rocprim::radix_sort_pairs<config>(
+  	            nullptr,
+                    temp_storage_bytes,
+                    linear_indices.data_ptr<index_t>(),
+                    linear_indices_sorted.data_ptr<index_t>(),
+                    infos.data_ptr<info_t>(),
+                    infos_sorted.data_ptr<info_t>(),
+                    linear_indices.numel(),
+                    0,
+                    total_hash_size_bits,
+                    at::cuda::getCurrentCUDAStream(),
+                    false);
+                auto temp_storage = at::empty(
+                    {static_cast<int64_t>(temp_storage_bytes)},
+                    indices.options().dtype(at::kByte));			    
+                rocprim::radix_sort_pairs<config>(
+                    temp_storage.data_ptr(),
+                    temp_storage_bytes,
+                    linear_indices.data_ptr<index_t>(),
+                    linear_indices_sorted.data_ptr<index_t>(),
+                    infos.data_ptr<info_t>(),
+                    infos_sorted.data_ptr<info_t>(),
+                    linear_indices.numel(),
+                    0,
+                    total_hash_size_bits,
+                    at::cuda::getCurrentCUDAStream(),
+                    false);
+#endif	      
+	      }
               if (total_unique_indices != -1) {
                 TORCH_CHECK(total_unique_indices >= 0);
                 sorted_linear_indices_run =

From cf45304240254abd98482e472e943454e8a94527 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Tue, 13 Aug 2024 16:27:45 +0200
Subject: [PATCH 6/7] Wrap rocmprim header with #ifdef

---
 .../src/split_embeddings_utils/transpose_embedding_input.cu   | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
index c0bbf2492b..25f9d0688d 100644
--- a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
+++ b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
@@ -9,7 +9,9 @@
 #include "fbgemm_gpu/embedding_backward_template_helpers.cuh" // @manual
 #include "fbgemm_gpu/ops_utils.h" // @manual
 #include "fbgemm_gpu/split_embeddings_utils.cuh" // @manual
+#ifdef USE_ROCM
 #include <rocprim/device/device_radix_sort.hpp>
+#endif
 // clang-format off
 #include "fbgemm_gpu/cub_namespace_prefix.cuh" // @manual
 #include <cub/device/device_radix_sort.cuh>
@@ -297,7 +299,7 @@ transpose_embedding_input(
               }
               {
                 size_t temp_storage_bytes = 0;
-#ifdef __HIP_PLATFORM_NVIDIA__ 
+#ifndef USE_ROCM
                 AT_CUDA_CHECK(
                     FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs(
                         nullptr,

From f39390077203af572c2b6658b49052c443802303 Mon Sep 17 00:00:00 2001
From: huizzhan <huizzhan@amd.com>
Date: Wed, 17 Sep 2025 01:28:39 +0000
Subject: [PATCH 7/7] Remove fwd and warmup from benchmark profiling

---
 ...plit_table_batched_embeddings_benchmark.py | 44 +++++++++----------
 fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py | 10 +++++
 2 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py
index 4be370b5a8..2bcfdf4144 100644
--- a/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py
@@ -1158,28 +1158,28 @@ def device_with_spec(  # noqa C901
         f"Accessed weights per batch: {B * sum_DLs * param_size_multiplier / 1.0e9: .2f} GB"
     )
 
-    # forward
-    time_per_iter = benchmark_requests(
-        requests,
-        lambda indices, offsets, per_sample_weights: emb.forward(
-            indices,
-            offsets,
-            per_sample_weights,
-            feature_requires_grad=feature_requires_grad,
-        ),
-        flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
-        num_warmups=warmup_runs,
-    )
-    logging.info(
-        f"Forward, B: {B}, "
-        f"Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, W: {weighted}, "
-        f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, "  # noqa: B950
-        f"T: {time_per_iter * 1.0e6:.0f}us"
-    )
-
-    if output_dtype == SparseType.INT8:
-        # backward bench not representative
-        return
+    # # forward
+    # time_per_iter = benchmark_requests(
+    #     requests,
+    #     lambda indices, offsets, per_sample_weights: emb.forward(
+    #         indices,
+    #         offsets,
+    #         per_sample_weights,
+    #         feature_requires_grad=feature_requires_grad,
+    #     ),
+    #     flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
+    #     num_warmups=warmup_runs,
+    # )
+    # logging.info(
+    #     f"Forward, B: {B}, "
+    #     f"Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, W: {weighted}, "
+    #     f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, "  # noqa: B950
+    #     f"T: {time_per_iter * 1.0e6:.0f}us"
+    # )
+
+    # if output_dtype == SparseType.INT8:
+    #     # backward bench not representative
+    #     return
 
     if do_pooling:
         grad_output = torch.randn(B, sum(Ds)).to(get_device())
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
index 00bf30d230..0f51c66caa 100644
--- a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
+++ b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
@@ -12,6 +12,9 @@
 import time
 from subprocess import Popen
 from typing import Callable, List, Optional, Tuple
+import roctx
+# from roctx.context_decorators import RoctxRange
+# from roctx.context_decorators import RoctxProfiler
 
 import torch
 
@@ -224,6 +227,8 @@ def benchmark_requests(  # noqa: C901
     if warmup_ms is None:
         num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
 
+    tid = roctx.getThreadId()
+    roctx.profilerPause(tid)
     # warm-up the GPU before profiling
     bench_warmup(
         requests[0],
@@ -241,6 +246,7 @@ def benchmark_requests(  # noqa: C901
 
     if callback_after_warmup is not None:
         callback_after_warmup()
+    roctx.profilerResume(tid)
 
     num_reqs = len(requests)
     iters = num_reqs if iters == -1 else iters
@@ -259,7 +265,11 @@ def benchmark_requests(  # noqa: C901
         indices, offsets, weights = req.unpack_3()
         if bwd_only:
             # Run forward before profiling if does backward only
+            tid = roctx.getThreadId()
+            roctx.profilerPause(tid)
+            # fwd kernel should be hidden by profiling tool
             out = func(indices, offsets, weights)
+            roctx.profilerResume(tid)
         start_time = time.time()
         if torch.cuda.is_available():
             if flush_gpu_cache_size_mb: