From b2ac80644fa70467f2f567ffac570aaca6ee8f69 Mon Sep 17 00:00:00 2001
From: Chun Wan
Date: Mon, 13 Apr 2026 01:50:52 +0800
Subject: [PATCH 1/3] drm/amdgpu: use mutex_lock_interruptible for
 gtt_window_lock in copy_mem_to_mem

When multiple processes trigger KFD BO eviction and subsequent restore
simultaneously, the non-interruptible mutex_lock() on gtt_window_lock
in amdgpu_ttm_copy_mem_to_mem() can deadlock: Process A holds the lock
and waits for Process B to release a BO, while Process B waits for the
same lock. Since mutex_lock() is not interruptible, neither process can
back off, resulting in a permanent device-to-host (D2H) copy hang.

This was observed on MI300X systems running multi-process RDMA
workloads under VRAM pressure, where the hang typically occurs after
30-40 minutes of sustained operation.

Replace mutex_lock() with mutex_lock_interruptible() so the wait can be
interrupted by signals, returning -ERESTARTSYS to allow the TTM
subsystem to retry or abort gracefully.

Tested on MI300X/MI308X (gfx942) with 8 GPUs under extreme VRAM
pressure (192/196 GB utilized) for 40+ minutes with zero hangs.

Signed-off-by: Chun Wan
---
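Notes (below the --- cut, dropped by git am):

The change follows the standard interruptible-sleep pattern. A minimal
sketch of the pattern with illustrative names (copy_ctx, window_lock,
do_window_copy are placeholders, not driver code):

	int copy_windowed(struct copy_ctx *ctx)
	{
		/*
		 * mutex_lock_interruptible() returns nonzero (-EINTR) if a
		 * signal arrives while waiting, so a task caught in a lock
		 * cycle can back off instead of blocking forever.
		 */
		if (mutex_lock_interruptible(&ctx->window_lock))
			return -ERESTARTSYS;	/* let the caller retry */

		do_window_copy(ctx);		/* critical section */
		mutex_unlock(&ctx->window_lock);
		return 0;
	}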

 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 93666b02c39ee..0fba410ee35e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -331,7 +331,8 @@ static int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
 	amdgpu_res_first(src->mem, src->offset, size, &src_mm);
 	amdgpu_res_first(dst->mem, dst->offset, size, &dst_mm);
 
-	mutex_lock(&adev->mman.gtt_window_lock);
+	if (mutex_lock_interruptible(&adev->mman.gtt_window_lock))
+		return -ERESTARTSYS;
 	while (src_mm.remaining) {
 		uint64_t from, to, cur_size, tiling_flags;
 		uint32_t num_type, data_format, max_com, write_compress_disable;

From 45d1e7fd20d56d37193a64fece0b5a03438dbb86 Mon Sep 17 00:00:00 2001
From: Chun Wan
Date: Mon, 13 Apr 2026 01:51:58 +0800
Subject: [PATCH 2/3] drm/amdkfd: add RDMA pin limit to prevent unbounded
 VRAM pinning

RDMA PeerDirect operations pin GPU buffer objects in VRAM, making them
ineligible for eviction. Without any limit, a misbehaving or
compromised RDMA peer can pin all available VRAM, starving other
processes and triggering cascading eviction failures that lead to
system hangs.

Add a dedicated atomic64_t rdma_pinned_bytes counter in amdgpu_kfd_dev
to track RDMA-pinned VRAM independently from the general vram_pinned
counter. In amdgpu_amdkfd_gpuvm_pin_bo(), enforce a new
dmabuf_pin_max_mb module parameter using atomic64_add_return() for
race-free accounting. If the total would exceed the configured limit,
roll back and return -ENOSPC. A dmabuf_reject_new_pins parameter serves
as a global kill switch that rejects all new pins outright.

Also improve PeerDirect error logging in kfd_peerdirect.c to
distinguish quota rejections (-ENOSPC) from other pin errors, and add a
rdma_pin_debug module parameter for optional runtime logging.

Tested on MI300X/MI308X with 128 RDMA pin attempts (32 GB total):
- Without limit (dmabuf_pin_max_mb=0): 120 pins succeed (30 GB pinned)
- With limit (dmabuf_pin_max_mb=512): only 2 pins succeed (512 MB),
  126 correctly rejected with -ENOSPC in dmesg
- No hangs or GPU resets in either configuration over 40 minutes

Signed-off-by: Chun Wan
---
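Notes (below the --- cut, dropped by git am):

The quota check is a reserve-then-check scheme: the BO size is added
first with atomic64_add_return(), and whichever caller pushes the total
past the limit rolls its own contribution back. A minimal sketch of the
scheme with illustrative names (reserve_pin_quota, pinned_bytes are
placeholders, not driver code):

	static atomic64_t pinned_bytes = ATOMIC64_INIT(0);

	static int reserve_pin_quota(u64 size, u64 limit)
	{
		/*
		 * Optimistically reserve; atomic64_add_return() yields the
		 * new total, so two racing callers cannot both read a stale
		 * sum and slip under the limit together.
		 */
		if ((u64)atomic64_add_return(size, &pinned_bytes) > limit) {
			atomic64_sub(size, &pinned_bytes);	/* roll back */
			return -ENOSPC;
		}
		return 0;	/* reservation held until the matching unpin */
	}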

 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 50 ++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       | 22 ++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c   |  6 ++-
 5 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6d1f4acab78e7..be452767d6a5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -238,6 +238,9 @@ extern int amdgpu_mtype_local;
 extern int amdgpu_enforce_isolation;
 #ifdef CONFIG_HSA_AMD
 extern int sched_policy;
+extern unsigned int dmabuf_pin_max_mb;
+extern int amdgpu_dmabuf_reject_new_pins;
+extern int amdgpu_rdma_pin_debug;
 extern bool debug_evictions;
 extern bool no_system_mem_limit;
 extern int halt_if_hws_hang;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 357a66379d15b..d3df86913134e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -111,6 +111,7 @@ struct amdgpu_kfd_dev {
 	int64_t vram_used[MAX_XCP];
 	uint64_t vram_used_aligned[MAX_XCP];
 	atomic64_t vram_pinned;
+	atomic64_t rdma_pinned_bytes;
 	bool init_complete;
 
 	struct work_struct reset_work;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index d53080493e6ab..320f7b9ddf636 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1591,10 +1591,37 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
 int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
 {
 	int ret = 0;
+	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+	u64 bo_size = amdgpu_bo_size(bo);
+	bool rdma_accounted = false;
+
+	/* Pin limit: reject new RDMA/P2P pins when global kill switch is on */
+	if (unlikely(amdgpu_dmabuf_reject_new_pins)) {
+		dev_info_ratelimited(adev->dev,
+				     "amdgpu: KFD RDMA pin rejected (dmabuf_reject_new_pins=1)\n");
+		return -ENOSPC;
+	}
+
+	/* Pin limit: enforce per-GPU max pinned VRAM for RDMA/P2P */
+	if (dmabuf_pin_max_mb && (domain & AMDGPU_GEM_DOMAIN_VRAM)) {
+		u64 limit = (u64)dmabuf_pin_max_mb << 20;
+		u64 new_total = atomic64_add_return((s64)bo_size,
+						    &adev->kfd.rdma_pinned_bytes);
+
+		if ((u64)new_total > limit) {
+			atomic64_sub((s64)bo_size, &adev->kfd.rdma_pinned_bytes);
+			dev_info_ratelimited(adev->dev,
+					     "KFD RDMA pin rejected: pinned=%lluMB + new=%lluMB > max=%uMB\n",
+					     (u64)(new_total - bo_size) >> 20,
+					     bo_size >> 20, dmabuf_pin_max_mb);
+			return -ENOSPC;
+		}
+		rdma_accounted = true;
+	}
 
 	ret = amdgpu_bo_reserve(bo, false);
 	if (unlikely(ret))
-		return ret;
+		goto err_accounting;
 
 	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
 		/*
@@ -1620,14 +1647,21 @@ int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
 	amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
 
 	if (!ret && bo->tbo.resource->mem_type == TTM_PL_VRAM)
-		atomic64_add(amdgpu_bo_size(bo),
-			     &amdgpu_ttm_adev(bo->tbo.bdev)->kfd.vram_pinned);
+		atomic64_add(amdgpu_bo_size(bo), &adev->kfd.vram_pinned);
 
 out:
 	amdgpu_bo_unreserve(bo);
+	if (ret)
+		goto err_accounting;
+	return 0;
+
+err_accounting:
+	if (rdma_accounted)
+		atomic64_sub((s64)bo_size, &adev->kfd.rdma_pinned_bytes);
 	return ret;
 }
+
 /**
  * amdgpu_amdkfd_gpuvm_unpin_bo() - Unpins BO using following criteria
  * @bo: Handle of buffer object being unpinned
@@ -1639,6 +1673,7 @@ int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
 void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
 {
 	int ret = 0;
+	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 
 	ret = amdgpu_bo_reserve(bo, false);
 	if (unlikely(ret))
@@ -1646,9 +1681,12 @@ void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
 
 	amdgpu_bo_unpin(bo);
 
-	if (bo->tbo.resource->mem_type == TTM_PL_VRAM)
-		atomic64_sub(amdgpu_bo_size(bo),
-			     &amdgpu_ttm_adev(bo->tbo.bdev)->kfd.vram_pinned);
+	if (bo->tbo.resource->mem_type == TTM_PL_VRAM) {
+		atomic64_sub(amdgpu_bo_size(bo), &adev->kfd.vram_pinned);
+		if (dmabuf_pin_max_mb)
+			atomic64_sub((s64)amdgpu_bo_size(bo),
+				     &adev->kfd.rdma_pinned_bytes);
+	}
 
 	amdgpu_bo_unreserve(bo);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 3bcb7917384b9..7735a762b4f2e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -802,6 +802,28 @@ module_param(max_num_of_queues_per_device, int, 0444);
 MODULE_PARM_DESC(max_num_of_queues_per_device,
 	"Maximum number of supported queues per device (1 = Minimum, 4096 = default)");
 
+/**
+ * DOC: dmabuf_pin_max_mb (uint)
+ * Maximum MB of VRAM pinned for RDMA/PeerDirect per GPU. 0 = unlimited.
+ */
+unsigned int dmabuf_pin_max_mb;
+module_param_named(dmabuf_pin_max_mb, dmabuf_pin_max_mb, uint, 0644);
+MODULE_PARM_DESC(dmabuf_pin_max_mb,
+	"Max VRAM pinned for RDMA/PeerDirect per GPU in MB (0 = unlimited (default))");
+
+/**
+ * DOC: dmabuf_reject_new_pins (int)
+ * Reject new RDMA/PeerDirect pins (global kill switch).
+ */
+int amdgpu_dmabuf_reject_new_pins;
+module_param_named(dmabuf_reject_new_pins, amdgpu_dmabuf_reject_new_pins, int, 0644);
+MODULE_PARM_DESC(dmabuf_reject_new_pins,
+	"Reject new RDMA pins (0 = allow (default), 1 = reject with -ENOSPC)");
+
+int amdgpu_rdma_pin_debug;
+module_param_named(rdma_pin_debug, amdgpu_rdma_pin_debug, int, 0644);
+MODULE_PARM_DESC(rdma_pin_debug, "Log RDMA pin/unpin events (0=off, 1=on)");
+
 /**
  * DOC: send_sigterm (int)
  * Send sigterm to HSA process on unhandled exceptions. Default is not to send sigterm
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
index cd75d2432a9f8..8d5afc49086eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
@@ -285,7 +285,11 @@ static int amd_get_pages(unsigned long addr, size_t size, int write, int force,
 	ret = amdgpu_amdkfd_gpuvm_pin_bo(mem_context->bo,
 					 mem_context->bo->kfd_bo->domain);
 	if (ret) {
-		pr_err("Pinning of buffer failed.\n");
+		if (ret == -ENOSPC)
+			pr_info("RDMA pin rejected by quota (addr=%#llx size=%#llx)\n",
+				mem_context->va, mem_context->size);
+		else
+			pr_err("Pinning of buffer failed: %d\n", ret);
 		return ret;
 	}

From 937cc36d34e1140a6ed3db44b32687b159d30c73 Mon Sep 17 00:00:00 2001
From: Chun Wan
Date: Mon, 13 Apr 2026 01:53:07 +0800
Subject: [PATCH 3/3] drm/amdkfd: fix RDMA pin quota when BO does not land in
 VRAM

amdgpu_amdkfd_gpuvm_pin_bo() increments rdma_pinned_bytes before
amdgpu_bo_pin() when the domain includes VRAM. If pinning succeeds but
the buffer ends up outside VRAM, unpin_bo() never subtracts from
rdma_pinned_bytes (it only does so for TTM_PL_VRAM), leaking quota.
Roll back the pre-accounted bytes in that case.

---
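Notes (below the --- cut, dropped by git am):

The leak as a concrete sequence (sizes illustrative):

	pin:   domain includes VRAM, bo_size = 256 MB
	       rdma_pinned_bytes += 256 MB            (quota reserved)
	       amdgpu_bo_pin() succeeds, but the BO lands outside VRAM
	unpin: mem_type != TTM_PL_VRAM, so nothing is subtracted
	       -> 256 MB of quota leaks per pin/unpin cycle

With this fix the rollback happens at pin time, so quota stays reserved
only while the BO is actually pinned in VRAM, the one case the unpin
path decrements.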

 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 320f7b9ddf636..73a5a561d1758 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1648,6 +1648,16 @@ int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
 
 	if (!ret && bo->tbo.resource->mem_type == TTM_PL_VRAM)
 		atomic64_add(amdgpu_bo_size(bo), &adev->kfd.vram_pinned);
+	else if (!ret && rdma_accounted &&
+		 bo->tbo.resource->mem_type != TTM_PL_VRAM) {
+		/*
+		 * Quota was reserved for a VRAM-domain pin; if the BO did
+		 * not end up in VRAM, roll back rdma_pinned_bytes (unpin
+		 * only decrements when mem_type == TTM_PL_VRAM).
+		 */
+		atomic64_sub((s64)bo_size, &adev->kfd.rdma_pinned_bytes);
+		rdma_accounted = false;
+	}
 
 out:
 	amdgpu_bo_unreserve(bo);
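
---
Series usage note: both knobs are plain module parameters created with
mode 0644, so they can be set at load time or flipped at runtime via
the standard sysfs path (values below are examples, not
recommendations):

	# cap RDMA/PeerDirect pinning at 512 MB per GPU at load time
	modprobe amdgpu dmabuf_pin_max_mb=512

	# emergency kill switch: reject all new RDMA pins with -ENOSPC
	echo 1 > /sys/module/amdgpu/parameters/dmabuf_reject_new_pins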