From 134b44d5a759820160a19f7d796ec122d3dc964b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 10 Apr 2026 22:02:40 +0200
Subject: [PATCH 1/3] CUDA: initialize NCCL comms lazily

---
 ggml/src/ggml-cuda/common.cuh   | 16 ++++++++++++----
 ggml/src/ggml-cuda/ggml-cuda.cu | 21 ++++++++++-----------
 2 files changed, 22 insertions(+), 15 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 56a67f1edc8..55c3074243c 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -25,6 +25,7 @@
 #include <cassert>
 #include <cfloat>
 #include <cstdio>
+#include <map>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -1092,10 +1093,6 @@ struct ggml_cuda_device_info {
     cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
 
     std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
-
-#ifdef GGML_USE_NCCL
-    ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
-#endif // GGML_USE_NCCL
 };
 
 const ggml_cuda_device_info & ggml_cuda_info();
@@ -1154,6 +1151,17 @@ struct ggml_cuda_pool_alloc {
     ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
 };
 
+#ifdef GGML_USE_NCCL
+static std::map<std::vector<int>, std::vector<ncclComm_t>> comms;
+
+static std::vector<ncclComm_t> ggml_cuda_get_nccl_comms(const std::vector<int> & devs) {
+    if (comms.find(devs) == comms.end()) {
+        comms[devs].resize(devs.size());
+        NCCL_CHECK(ncclCommInitAll(comms[devs].data(), devs.size(), devs.data()));
+    }
+    return comms[devs];
+}
+#endif // GGML_USE_NCCL
 
 // backend interface
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 8613d20b9f9..1f12e447283 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -338,14 +338,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
         }
     }
 
-#ifdef GGML_USE_NCCL
-    int dev_ids[GGML_CUDA_MAX_DEVICES];
-    for (int id = 0; id < info.device_count; ++id) {
-        dev_ids[id] = id;
-    }
-    NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));
-#endif // GGML_USE_NCCL
-
     return info;
 }
 
@@ -1139,7 +1131,14 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
         GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
     }
 
-    const ggml_cuda_device_info info = ggml_cuda_info();
+    const ggml_cuda_device_info & info = ggml_cuda_info();
+    std::vector<int> dev_ids;
+    dev_ids.reserve(n_backends);
+    for (size_t i = 0; i < n_backends; ++i) {
+        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
+        dev_ids.push_back(cuda_ctx->device);
+    }
+    const std::vector<ncclComm_t> comms = ggml_cuda_get_nccl_comms(dev_ids);
 
     // For small tensors, simply reduce them as FP32.
     // The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
@@ -1147,7 +1146,7 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
         NCCL_CHECK(ncclGroupStart());
         for (size_t i = 0; i < n_backends; ++i) {
             ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
-            NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
+            NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comms[i], cuda_ctx->stream()));
         }
         NCCL_CHECK(ncclGroupEnd());
 
@@ -1172,7 +1171,7 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
     NCCL_CHECK(ncclGroupStart());
     for (size_t i = 0; i < n_backends; ++i) {
         ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
-        NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
+        NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, comms[i], cuda_ctx->stream()));
     }
     NCCL_CHECK(ncclGroupEnd());
 

From a0a8859e8698b2487c743bb01614fc5720a5244f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sat, 11 Apr 2026 13:42:23 +0200
Subject: [PATCH 2/3] CUDA: guard NCCL comms cache and collectives with a mutex

---
 ggml/src/ggml-cuda/common.cuh   | 13 ++++++++-----
 ggml/src/ggml-cuda/ggml-cuda.cu | 32 +++++++++++++++++++-------------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 55c3074243c..eb28b5165cb 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -26,6 +26,7 @@
 #include <cfloat>
 #include <cstdio>
 #include <map>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -1152,14 +1153,16 @@ struct ggml_cuda_pool_alloc {
 };
 
 #ifdef GGML_USE_NCCL
-static std::map<std::vector<int>, std::vector<ncclComm_t>> comms;
+static std::map<std::vector<int>, std::vector<ncclComm_t>> nccl_comms;
+static std::mutex nccl_mutex;
 
 static std::vector<ncclComm_t> ggml_cuda_get_nccl_comms(const std::vector<int> & devs) {
-    if (comms.find(devs) == comms.end()) {
-        comms[devs].resize(devs.size());
-        NCCL_CHECK(ncclCommInitAll(comms[devs].data(), devs.size(), devs.data()));
+    std::lock_guard lock(nccl_mutex);
+    if (nccl_comms.find(devs) == nccl_comms.end()) {
+        nccl_comms[devs].resize(devs.size());
+        NCCL_CHECK(ncclCommInitAll(nccl_comms[devs].data(), devs.size(), devs.data()));
     }
-    return comms[devs];
+    return nccl_comms[devs];
 }
 #endif // GGML_USE_NCCL
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 1f12e447283..4faee29d64c 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1142,15 +1142,18 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
 
     // For small tensors, simply reduce them as FP32.
     // The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
-    if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
-        NCCL_CHECK(ncclGroupStart());
-        for (size_t i = 0; i < n_backends; ++i) {
-            ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
-            NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comms[i], cuda_ctx->stream()));
-        }
-        NCCL_CHECK(ncclGroupEnd());
+    {
+        std::lock_guard lock(nccl_mutex);
+        if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
+            NCCL_CHECK(ncclGroupStart());
+            for (size_t i = 0; i < n_backends; ++i) {
+                ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
+                NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comms[i], cuda_ctx->stream()));
+            }
+            NCCL_CHECK(ncclGroupEnd());
 
-        return true;
+            return true;
+        }
     }
 
     // For large tensors it's faster to compress them to BF16 for the reduction:
@@ -1168,12 +1171,15 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
         CUDA_CHECK(cudaGetLastError());
     }
 
-    NCCL_CHECK(ncclGroupStart());
-    for (size_t i = 0; i < n_backends; ++i) {
-        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
-        NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, comms[i], cuda_ctx->stream()));
+    {
+        std::lock_guard lock(nccl_mutex);
+        NCCL_CHECK(ncclGroupStart());
+        for (size_t i = 0; i < n_backends; ++i) {
+            ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
+            NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, comms[i], cuda_ctx->stream()));
+        }
+        NCCL_CHECK(ncclGroupEnd());
     }
-    NCCL_CHECK(ncclGroupEnd());
 
     for (size_t i = 0; i < n_backends; ++i) {
         ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;

From f7dc561c94319ef6e517e9c8f5bb0bf49f8ee587 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Mon, 13 Apr 2026 20:55:16 +0200
Subject: [PATCH 3/3] CUDA: move NCCL comms cache from common.cuh to ggml-cuda.cu

---
 ggml/src/ggml-cuda/common.cuh   | 14 --------------
 ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++++++++++++++++--
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index eb28b5165cb..5b022ba038a 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1152,20 +1152,6 @@ struct ggml_cuda_pool_alloc {
     ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
 };
 
-#ifdef GGML_USE_NCCL
-static std::map<std::vector<int>, std::vector<ncclComm_t>> nccl_comms;
-static std::mutex nccl_mutex;
-
-static std::vector<ncclComm_t> ggml_cuda_get_nccl_comms(const std::vector<int> & devs) {
-    std::lock_guard lock(nccl_mutex);
-    if (nccl_comms.find(devs) == nccl_comms.end()) {
-        nccl_comms[devs].resize(devs.size());
-        NCCL_CHECK(ncclCommInitAll(nccl_comms[devs].data(), devs.size(), devs.data()));
-    }
-    return nccl_comms[devs];
-}
-#endif // GGML_USE_NCCL
-
 // backend interface
 
 struct ggml_tensor_extra_gpu {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 4faee29d64c..a02d7885274 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -566,6 +566,20 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
 
+#ifdef GGML_USE_NCCL
+static std::map<std::vector<int>, std::vector<ncclComm_t>> ggml_cuda_nccl_comms; // communicator cache, keyed by the ordered device id list
+static std::mutex ggml_cuda_nccl_mutex; // guards ggml_cuda_nccl_comms; also held around the NCCL group calls in ggml_backend_cuda_allreduce_tensor
+// Returns a copy of the communicators for devs, creating them with ncclCommInitAll on the first request for that list.
+static std::vector<ncclComm_t> ggml_cuda_get_nccl_comms(const std::vector<int> & devs) {
+    std::lock_guard lock(ggml_cuda_nccl_mutex);
+    const auto entry = ggml_cuda_nccl_comms.try_emplace(devs, devs.size()); // one lookup; a new key gets devs.size() empty slots
+    if (entry.second) { // inserted => first request for this device list, the slots still have to be initialized
+        NCCL_CHECK(ncclCommInitAll(entry.first->second.data(), devs.size(), devs.data()));
+    }
+    return entry.first->second;
+}
+#endif // GGML_USE_NCCL
+
 // destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
 // this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured
 
@@ -1143,7 +1157,7 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
     // For small tensors, simply reduce them as FP32.
     // The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
     {
-        std::lock_guard lock(nccl_mutex);
+        std::lock_guard lock(ggml_cuda_nccl_mutex);
         if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
             NCCL_CHECK(ncclGroupStart());
             for (size_t i = 0; i < n_backends; ++i) {
@@ -1172,7 +1186,7 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
     }
 
     {
-        std::lock_guard lock(nccl_mutex);
+        std::lock_guard lock(ggml_cuda_nccl_mutex);
         NCCL_CHECK(ncclGroupStart());
         for (size_t i = 0; i < n_backends; ++i) {
             ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;