diff --git a/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py index 4be370b5a8..2bcfdf4144 100644 --- a/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/tbe/split_table_batched_embeddings_benchmark.py @@ -1158,28 +1158,28 @@ def device_with_spec( # noqa C901 f"Accessed weights per batch: {B * sum_DLs * param_size_multiplier / 1.0e9: .2f} GB" ) - # forward - time_per_iter = benchmark_requests( - requests, - lambda indices, offsets, per_sample_weights: emb.forward( - indices, - offsets, - per_sample_weights, - feature_requires_grad=feature_requires_grad, - ), - flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, - num_warmups=warmup_runs, - ) - logging.info( - f"Forward, B: {B}, " - f"Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, W: {weighted}, " - f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950 - f"T: {time_per_iter * 1.0e6:.0f}us" - ) - - if output_dtype == SparseType.INT8: - # backward bench not representative - return + # # forward + # time_per_iter = benchmark_requests( + # requests, + # lambda indices, offsets, per_sample_weights: emb.forward( + # indices, + # offsets, + # per_sample_weights, + # feature_requires_grad=feature_requires_grad, + # ), + # flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + # num_warmups=warmup_runs, + # ) + # logging.info( + # f"Forward, B: {B}, " + # f"Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, W: {weighted}, " + # f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950 + # f"T: {time_per_iter * 1.0e6:.0f}us" + # ) + + # if output_dtype == SparseType.INT8: + # # backward bench not representative + # return if do_pooling: grad_output = torch.randn(B, sum(Ds)).to(get_device()) diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py index 00bf30d230..0f51c66caa 100644 --- a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py +++ b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py @@ -12,6 +12,9 @@ import time from subprocess import Popen from typing import Callable, List, Optional, Tuple +import roctx +# from roctx.context_decorators import RoctxRange +# from roctx.context_decorators import RoctxProfiler import torch @@ -224,6 +227,8 @@ def benchmark_requests( # noqa: C901 if warmup_ms is None: num_warmups = num_warmups + 1 if num_warmups >= 0 else 1 + tid = roctx.getThreadId() + roctx.profilerPause(tid) # warm-up the GPU before profiling bench_warmup( requests[0], @@ -241,6 +246,7 @@ def benchmark_requests( # noqa: C901 if callback_after_warmup is not None: callback_after_warmup() + roctx.profilerResume(tid) num_reqs = len(requests) iters = num_reqs if iters == -1 else iters @@ -259,7 +265,11 @@ def benchmark_requests( # noqa: C901 indices, offsets, weights = req.unpack_3() if bwd_only: # Run forward before profiling if does backward only + tid = roctx.getThreadId() + roctx.profilerPause(tid) + # fwd kernel should be hidden by profiling tool out = func(indices, offsets, weights) + roctx.profilerResume(tid) start_time = time.time() if torch.cuda.is_available(): if flush_gpu_cache_size_mb: