ROCm · aryaman-gupta · Dec 5, 2025 · Dec 9, 2025
diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
@@ -226,7 +226,7 @@ struct load_row_per_warp<half, 320, index_t> {
         llvm_amdgcn_raw_buffer_load_fp16x2(
             emb_res, (lane_id + 64) * sizeof(half2));
     emb_data[4] = llvm_amdgcn_raw_buffer_load_fp16(
-        emb_res, (lane_id + 128) * sizeof(half));
+        emb_res, (lane_id + 256) * sizeof(half));
   }
 };
 

diff --git a/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py b/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py
@@ -19,6 +19,7 @@
 from .backward_adagrad_common import (
     additional_decorators,
     adjust_mixed_B_st,
+    CacheAlgorithm,
     common_settings,
     common_strategy,
     execute_backward_adagrad,
@@ -221,6 +222,29 @@ def test_backward_adagrad_fp16_pmSUM_with_max_norm(  # noqa C901
             **kwargs,
         )
 
+    @unittest.skipIf(*gpu_unavailable)
+    def test_backward_adagrad_fp16_pmSUM_D320(self) -> None:
+        execute_backward_adagrad(
+            T=2,
+            # Pass D=80: the test harness multiplies D by 4, so the effective D is 320.
+            D=80,
+            B=16,
+            log_E=4,
+            L=4,
+            D_gradcheck=1,
+            weights_precision=SparseType.FP16,
+            stochastic_rounding=False,
+            weighted=False,
+            row_wise=True,
+            mixed=False,
+            mixed_B=False,
+            use_cache=False,
+            cache_algorithm=CacheAlgorithm.LRU,
+            pooling_mode=PoolingMode.SUM,
+            use_cpu=False,
+            output_dtype=SparseType.FP16,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()