[CUDA] Reduce CPU-side stalls due to the CUDA command buffer being full

gaugarg-nv · gaugarg-nv · commit d3298dc3cbf0 · 2026-01-23T13:37:26.000+05:30
With pipeline parallelism, the CPU-side CUDA command buffer fills up during prompt processing and stalls the CPU. As a result, not enough work is submitted to the GPU, which leaves bubbles in the GPU timeline.
Fix this by setting the CUDA environment variable CUDA_SCALE_LAUNCH_QUEUES to 4x, which increases the command buffer size, unless the user has already set the variable themselves.
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -194,6 +194,22 @@ static int ggml_cuda_parse_id(char devName[]) {
 static ggml_cuda_device_info ggml_cuda_init() {
     ggml_cuda_device_info info = {};
 
+    // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
+    if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
+#ifdef _WIN32
+        _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
+#else
+        setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
+#endif
+
+        GGML_LOG_WARN("\n");
+        GGML_LOG_WARN("================================================================================\n");
+        GGML_LOG_WARN("  CUDA_SCALE_LAUNCH_QUEUES=4x has been enabled\n");
+        GGML_LOG_WARN("  This environment variable improves performance with multiple GPUs\n");
+        GGML_LOG_WARN("================================================================================\n");
+        GGML_LOG_WARN("\n");
+    }
+
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
         GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));