From 9963bd5d10e8ddfcb9e36fd721808214d7b49ece Mon Sep 17 00:00:00 2001 From: westers Date: Fri, 27 Mar 2026 02:42:09 -0500 Subject: [PATCH] cuda: fix MMVQ performance regression on gfx1103 (Radeon 780M) iGPU Commit 617db241 added nwarps=8 tuning for RDNA3_0 discrete GPUs (W7900/gfx1100), which causes ~30% throughput regression on gfx1103 iGPU (Radeon 780M/760M): 18 tok/s down to 13 tok/s. The iGPU has significantly less memory bandwidth and fewer CUs than discrete RDNA3 GPUs, so the higher warp count causes thread contention rather than improving throughput. Fix: detect gfx1103 specifically and use RDNA2 MMVQ parameters (nwarps=1) instead of RDNA3_0 parameters (nwarps=8). This restores pre-617db241 performance on iGPU while keeping the optimization for discrete RDNA3 GPUs. Changes: - vendors/hip.h: Add RDNA3_IGPU define for __gfx1103__ - common.cuh: Add GGML_CUDA_CC_RDNA3_IGPU constant and macro - mmvq.cu: Route gfx1103 to RDNA2 parameter table (nwarps=1) Fixes ggml-org/llama.cpp#20647 Signed-off-by: Steve Westers Co-Authored-By: Claude Opus 4.6 (1M context) --- ggml/src/ggml-cuda/common.cuh | 2 ++ ggml/src/ggml-cuda/mmvq.cu | 5 ++++- ggml/src/ggml-cuda/vendors/hip.h | 8 ++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 7d7f20af3a0..b89e325fcfb 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -72,6 +72,7 @@ #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA +#define GGML_CUDA_CC_RDNA3_IGPU (GGML_CUDA_CC_OFFSET_AMD + 0x1103) // Radeon 780M/760M iGPU #define GGML_CUDA_CC_RDNA3_5 (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops. #define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 @@ -79,6 +80,7 @@ #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) +#define GGML_CUDA_CC_IS_RDNA3_IGPU(cc) (cc == GGML_CUDA_CC_RDNA3_IGPU) // gfx1103: Radeon 780M/760M iGPU #define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5) #define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4) #define GGML_CUDA_CC_IS_RDNA3(cc) (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 66bd8beeae7..5f908d91edb 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -72,7 +72,7 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() { return MMVQ_PARAMETERS_RDNA4; #elif defined(RDNA3_0) return MMVQ_PARAMETERS_RDNA3_0; -#elif defined(RDNA2) || defined(RDNA3_5) +#elif defined(RDNA2) || defined(RDNA3_5) || defined(RDNA3_IGPU) return MMVQ_PARAMETERS_RDNA2; #elif defined(GCN) || defined(CDNA) return MMVQ_PARAMETERS_GCN; @@ -85,6 +85,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { if (GGML_CUDA_CC_IS_RDNA4(cc)) { return MMVQ_PARAMETERS_RDNA4; } + if (GGML_CUDA_CC_IS_RDNA3_IGPU(cc)) { + return MMVQ_PARAMETERS_RDNA2; // iGPU (gfx1103) regresses with nwarps=8, use nwarps=1 + } if (GGML_CUDA_CC_IS_RDNA3_0(cc)) { return MMVQ_PARAMETERS_RDNA3_0; } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 9d9ba1ee219..6aecb8d9c7b 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -211,9 +211,13 @@ #define RDNA3_5 #endif // defined(__gfx1150__) || defined(__gfx1151__) -#if defined(RDNA3) && !defined(RDNA3_5) +#if defined(__gfx1103__) +#define RDNA3_IGPU +#endif // defined(__gfx1103__) + +#if defined(RDNA3) && !defined(RDNA3_5) && !defined(RDNA3_IGPU) #define RDNA3_0 -#endif // defined(RDNA3) && !defined(RDNA3_5) +#endif // defined(RDNA3) && !defined(RDNA3_5) && !defined(RDNA3_IGPU) #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)