diff --git a/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py
index 4be370b5a8..2bcfdf4144 100644
--- a/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py
@@ -1158,28 +1158,28 @@ def device_with_spec(  # noqa C901
         f"Accessed weights per batch: {B * sum_DLs * param_size_multiplier / 1.0e9: .2f} GB"
     )
 
-    # forward
-    time_per_iter = benchmark_requests(
-        requests,
-        lambda indices, offsets, per_sample_weights: emb.forward(
-            indices,
-            offsets,
-            per_sample_weights,
-            feature_requires_grad=feature_requires_grad,
-        ),
-        flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
-        num_warmups=warmup_runs,
-    )
-    logging.info(
-        f"Forward, B: {B}, "
-        f"Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, W: {weighted}, "
-        f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, "  # noqa: B950
-        f"T: {time_per_iter * 1.0e6:.0f}us"
-    )
-
-    if output_dtype == SparseType.INT8:
-        # backward bench not representative
-        return
+    # # forward
+    # time_per_iter = benchmark_requests(
+    #     requests,
+    #     lambda indices, offsets, per_sample_weights: emb.forward(
+    #         indices,
+    #         offsets,
+    #         per_sample_weights,
+    #         feature_requires_grad=feature_requires_grad,
+    #     ),
+    #     flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
+    #     num_warmups=warmup_runs,
+    # )
+    # logging.info(
+    #     f"Forward, B: {B}, "
+    #     f"Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, W: {weighted}, "
+    #     f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, "  # noqa: B950
+    #     f"T: {time_per_iter * 1.0e6:.0f}us"
+    # )
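+    # NOTE(review): the INT8 early return below is commented out, so the code after it now also runs when output_dtype is SparseType.INT8 - confirm this is intended.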
+    # if output_dtype == SparseType.INT8:
+    #     # backward bench not representative
+    #     return
 
     if do_pooling:
         grad_output = torch.randn(B, sum(Ds)).to(get_device())
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
index 00bf30d230..0f51c66caa 100644
--- a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
+++ b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
@@ -12,6 +12,9 @@
 import time
 from subprocess import Popen
 from typing import Callable, List, Optional, Tuple
+import roctx
+# from roctx.context_decorators import RoctxRange
+# from roctx.context_decorators import RoctxProfiler
 
 import torch
 
@@ -224,6 +227,8 @@ def benchmark_requests(  # noqa: C901
     if warmup_ms is None:
         num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
 
+    tid = roctx.getThreadId()
+    roctx.profilerPause(tid)
     # warm-up the GPU before profiling
     bench_warmup(
         requests[0],
@@ -241,6 +246,7 @@ def benchmark_requests(  # noqa: C901
 
     if callback_after_warmup is not None:
         callback_after_warmup()
+    roctx.profilerResume(tid)
 
     num_reqs = len(requests)
     iters = num_reqs if iters == -1 else iters
@@ -259,7 +265,11 @@ def benchmark_requests(  # noqa: C901
         indices, offsets, weights = req.unpack_3()
         if bwd_only:
             # Run forward before profiling if does backward only
+            tid = roctx.getThreadId()
+            roctx.profilerPause(tid)
+            # The profiler is paused around this forward pass so that its kernels stay out of the captured profile.
             out = func(indices, offsets, weights)
+            roctx.profilerResume(tid)
         start_time = time.time()
         if torch.cuda.is_available():
             if flush_gpu_cache_size_mb: