From 6c80ace5a3f5888448a812b013d1f5550751ba64 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 11 Apr 2026 14:06:43 +0800 Subject: [PATCH 1/9] ggml: add graph_reused --- ggml/include/ggml.h | 2 ++ ggml/src/ggml-backend.cpp | 4 ++++ ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++++ ggml/src/ggml-impl.h | 2 ++ ggml/src/ggml.c | 6 ++++++ src/llama-context.cpp | 3 +++ 6 files changed, 21 insertions(+) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 11d3e8a8167..e7f4a31a900 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2651,6 +2651,8 @@ extern "C" { GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph); GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph); + GGML_API void ggml_graph_set_reused(struct ggml_cgraph * cgraph, bool reused); + GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API size_t ggml_graph_overhead(void); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 1a555bf2a4d..25abb22424b 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1891,6 +1891,10 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch } } + for (int i = 0; i < sched->n_splits; i++) { + sched->splits[i].graph.reused = graph->reused; + } + return ggml_backend_sched_compute_splits(sched); } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 3113de017f0..ca6dbd8d9e9 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3060,6 +3060,10 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx const void * graph_key = ggml_cuda_graph_get_key(cgraph); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); + if (cgraph->reused && (int)graph->node_props.size() == cgraph->n_nodes) { + return false; + } + // Check if the graph size has changed if ((int)graph->node_props.size() != cgraph->n_nodes) { res = true; diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 0639db362e7..18296e10d3d 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -338,6 +338,8 @@ struct ggml_cgraph { struct ggml_hash_set visited_hash_set; enum ggml_cgraph_eval_order order; + + bool reused; }; // returns a slice of cgraph with nodes [i0, i1) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 0142498d967..e318e48b172 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7098,6 +7098,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.use_counts =*/ use_counts_ptr, /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.reused =*/ false, }; ggml_hash_set_reset(&cgraph->visited_hash_set); @@ -7125,6 +7126,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.use_counts =*/ cgraph0->use_counts, /*.visited_hash_set =*/ cgraph0->visited_hash_set, /*.order =*/ cgraph0->order, + /*.reused =*/ cgraph0->reused, }; return cgraph; @@ -7260,6 +7262,10 @@ int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) { return cgraph->n_nodes; } +void ggml_graph_set_reused(struct ggml_cgraph * cgraph, bool reused) { + cgraph->reused = reused; +} + void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { GGML_ASSERT(cgraph->size > cgraph->n_nodes); cgraph->nodes[cgraph->n_nodes] = tensor; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ee0c29235cd..61c53be090b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1192,6 +1192,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll ggml_backend_sched_synchronize(sched.get()); } + ggml_graph_set_reused(gf, true); n_reused++; } else { res->reset(); @@ -1211,6 +1212,8 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll return nullptr; } + ggml_graph_set_reused(gf, false); + if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); ret = GGML_STATUS_ALLOC_FAILED; From b510b595260482d82ade96a65e0d8de00ea67213 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 11 Apr 2026 22:03:27 +0800 Subject: [PATCH 2/9] use versioning instead of reuse flag --- ggml/include/ggml.h | 2 -- ggml/src/ggml-backend.cpp | 2 +- ggml/src/ggml-cuda/common.cuh | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 6 +++++- ggml/src/ggml-impl.h | 7 ++++++- ggml/src/ggml.c | 10 ++++------ src/llama-context.cpp | 3 --- 7 files changed, 17 insertions(+), 14 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e7f4a31a900..11d3e8a8167 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2651,8 +2651,6 @@ extern "C" { GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph); GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_set_reused(struct ggml_cgraph * cgraph, bool reused); - GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API size_t ggml_graph_overhead(void); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 25abb22424b..a6cdebf41fb 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1892,7 +1892,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch } for (int i = 0; i < sched->n_splits; i++) { - sched->splits[i].graph.reused = graph->reused; + sched->splits[i].graph.version = graph->version; } return ggml_backend_sched_compute_splits(sched); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 8a4246223b5..0624ac82bb8 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1183,6 +1183,7 @@ struct ggml_cuda_graph { std::vector nodes; bool disable_due_to_gpu_arch = false; bool warmup_complete = false; + uint64_t last_graph_version = 0; struct node_properties { ggml_tensor node; void * node_src_data_ptrs[GGML_MAX_SRC]; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ca6dbd8d9e9..e72158e2cde 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3060,10 +3060,14 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx const void * graph_key = ggml_cuda_graph_get_key(cgraph); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); - if (cgraph->reused && (int)graph->node_props.size() == cgraph->n_nodes) { + if (cgraph->version != 0 && + cgraph->version == graph->last_graph_version && + (int)graph->node_props.size() == cgraph->n_nodes) { return false; } + graph->last_graph_version = cgraph->version; + // Check if the graph size has changed if ((int)graph->node_props.size() != cgraph->n_nodes) { res = true; diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 18296e10d3d..76c0a138620 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -30,6 +30,11 @@ extern "C" { void ggml_print_backtrace(void); +static inline uint64_t ggml_graph_next_version(void) { + static uint64_t counter = 0; + return ++counter; +} + #ifndef MIN # define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif @@ -339,7 +344,7 @@ struct ggml_cgraph { enum ggml_cgraph_eval_order order; - bool reused; + uint64_t version; }; // returns a slice of cgraph with nodes [i0, i1) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e318e48b172..005165571e1 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6912,6 +6912,7 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten if (n_new > 0) { // the last added node should always be starting point GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); + cgraph->version = ggml_graph_next_version(); } } @@ -7098,7 +7099,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.use_counts =*/ use_counts_ptr, /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, - /*.reused =*/ false, + /*.version =*/ 0, }; ggml_hash_set_reset(&cgraph->visited_hash_set); @@ -7126,7 +7127,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.use_counts =*/ cgraph0->use_counts, /*.visited_hash_set =*/ cgraph0->visited_hash_set, /*.order =*/ cgraph0->order, - /*.reused =*/ cgraph0->reused, + /*.version =*/ cgraph0->version, }; return cgraph; @@ -7238,6 +7239,7 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { cgraph->n_leafs = 0; cgraph->n_nodes = 0; ggml_hash_set_reset(&cgraph->visited_hash_set); + cgraph->version = ggml_graph_next_version(); } int ggml_graph_size(struct ggml_cgraph * cgraph) { @@ -7262,10 +7264,6 @@ int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) { return cgraph->n_nodes; } -void ggml_graph_set_reused(struct ggml_cgraph * cgraph, bool reused) { - cgraph->reused = reused; -} - void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { GGML_ASSERT(cgraph->size > cgraph->n_nodes); cgraph->nodes[cgraph->n_nodes] = tensor; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 61c53be090b..ee0c29235cd 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1192,7 +1192,6 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll ggml_backend_sched_synchronize(sched.get()); } - ggml_graph_set_reused(gf, true); n_reused++; } else { res->reset(); @@ -1212,8 +1211,6 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll return nullptr; } - ggml_graph_set_reused(gf, false); - if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); ret = GGML_STATUS_ALLOC_FAILED; From 6d0a18b98a1e1e4c9922e574e684bfa5afaad717 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 12 Apr 2026 14:49:32 +0800 Subject: [PATCH 3/9] increment version with atomic --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++-- ggml/src/ggml-impl.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e72158e2cde..c2b305cb642 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3061,8 +3061,8 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); if (cgraph->version != 0 && - cgraph->version == graph->last_graph_version && - (int)graph->node_props.size() == cgraph->n_nodes) { + cgraph->version == graph->last_graph_version) { + GGML_ASSERT((int)graph->node_props.size() == cgraph->n_nodes); return false; } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 76c0a138620..48b7eb68967 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -32,7 +32,7 @@ void ggml_print_backtrace(void); static inline uint64_t ggml_graph_next_version(void) { static uint64_t counter = 0; - return ++counter; + return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); } #ifndef MIN From 4bb088b2711b9bb562f43af89545e14e2627b38a Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 12 Apr 2026 17:28:42 +0800 Subject: [PATCH 4/9] use top bits for split numbering --- ggml/src/ggml-backend.cpp | 4 +++- ggml/src/ggml-impl.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index a6cdebf41fb..e9cb86c8ab8 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -864,6 +864,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co return -1; } +#define GGML_SCHED_MAX_SPLITS_BITS 12 + #if 0 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only @@ -1892,7 +1894,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch } for (int i = 0; i < sched->n_splits; i++) { - sched->splits[i].graph.version = graph->version; + sched->splits[i].graph.version = graph->version | ((uint64_t)(i + 1) << (64 - GGML_SCHED_MAX_SPLITS_BITS)); } return ggml_backend_sched_compute_splits(sched); diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 48b7eb68967..443436940f8 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -31,7 +31,7 @@ extern "C" { void ggml_print_backtrace(void); static inline uint64_t ggml_graph_next_version(void) { - static uint64_t counter = 0; + static uint64_t counter = 1; return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); } From fd547247c3587607dcf574c40df11373276fe121 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 12 Apr 2026 17:36:22 +0800 Subject: [PATCH 5/9] add assert --- ggml/src/ggml-backend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index e9cb86c8ab8..a5c2fc57dee 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1893,6 +1893,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch } } + GGML_ASSERT(sched->n_splits < (1 << GGML_SCHED_MAX_SPLITS_BITS)); for (int i = 0; i < sched->n_splits; i++) { sched->splits[i].graph.version = graph->version | ((uint64_t)(i + 1) << (64 - GGML_SCHED_MAX_SPLITS_BITS)); } From 5d967fbaaf5fc76ad12d870469a15383fa36071f Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 12 Apr 2026 17:48:17 +0800 Subject: [PATCH 6/9] move counter to ggml.c --- ggml/src/ggml-impl.h | 5 +---- ggml/src/ggml.c | 5 +++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 443436940f8..81e486a84a5 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -30,10 +30,7 @@ extern "C" { void ggml_print_backtrace(void); -static inline uint64_t ggml_graph_next_version(void) { - static uint64_t counter = 1; - return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); -} +uint64_t ggml_graph_next_version(void); #ifndef MIN # define MIN(a, b) ((a) < (b) ? (a) : (b)) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 005165571e1..a6fd0b8adb6 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -53,6 +53,11 @@ #define UNUSED GGML_UNUSED +uint64_t ggml_graph_next_version(void) { + static uint64_t counter = 1; + return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); // TODO: make portable +} + // Needed for ggml_fp32_to_bf16_row() #if defined(__AVX512BF16__) #if defined(_MSC_VER) From 266c3e55ce61778342e0b10c6f6c78af5a8fa404 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 13 Apr 2026 10:50:56 +0800 Subject: [PATCH 7/9] set uid in split_graph only --- ggml/src/ggml-backend.cpp | 20 +++++++++++++------- ggml/src/ggml-cuda/common.cuh | 2 +- ggml/src/ggml-cuda/ggml-cuda.cu | 7 ++++--- ggml/src/ggml-impl.h | 2 +- ggml/src/ggml.c | 4 +--- 5 files changed, 20 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index a5c2fc57dee..34ca7768a5d 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -761,6 +761,11 @@ static bool ggml_is_view_op(enum ggml_op op) { #define GGML_SCHED_MAX_COPIES 4 #endif +#ifndef GGML_SCHED_MAX_SPLIT_BITS +#define GGML_SCHED_MAX_SPLIT_BITS 12 // log2(4096) +#endif + + struct ggml_backend_sched_split { int backend_id; int i_start; @@ -864,8 +869,6 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co return -1; } -#define GGML_SCHED_MAX_SPLITS_BITS 12 - #if 0 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only @@ -1032,6 +1035,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra GGML_ABORT("%s: failed to initialize context\n", __func__); } + graph->uid = ggml_graph_next_version(); + // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; @@ -1479,6 +1484,12 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra assert(graph_copy->size > graph_copy->n_leafs); graph_copy->leafs[graph_copy->n_leafs++] = leaf; } + + // set ids for all splits + GGML_ASSERT(sched->n_splits < (1 << GGML_SCHED_MAX_SPLIT_BITS)); + for (int i = 0; i < sched->n_splits; ++i) { + sched->splits[i].graph.uid = graph->uid | ((uint64_t)(i + 1) << (64 - GGML_SCHED_MAX_SPLIT_BITS)); + } } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { @@ -1893,11 +1904,6 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch } } - GGML_ASSERT(sched->n_splits < (1 << GGML_SCHED_MAX_SPLITS_BITS)); - for (int i = 0; i < sched->n_splits; i++) { - sched->splits[i].graph.version = graph->version | ((uint64_t)(i + 1) << (64 - GGML_SCHED_MAX_SPLITS_BITS)); - } - return ggml_backend_sched_compute_splits(sched); } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 0624ac82bb8..ae8f015b014 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1183,7 +1183,7 @@ struct ggml_cuda_graph { std::vector nodes; bool disable_due_to_gpu_arch = false; bool warmup_complete = false; - uint64_t last_graph_version = 0; + uint64_t last_graph_uid = 0; struct node_properties { ggml_tensor node; void * node_src_data_ptrs[GGML_MAX_SRC]; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c2b305cb642..c1982631c89 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3060,13 +3060,14 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx const void * graph_key = ggml_cuda_graph_get_key(cgraph); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); - if (cgraph->version != 0 && - cgraph->version == graph->last_graph_version) { + if (cgraph->uid != 0 && + cgraph->uid == graph->last_graph_uid) { + GGML_LOG_DEBUG("CUDA Graph id %zu reused\n", cgraph->uid); GGML_ASSERT((int)graph->node_props.size() == cgraph->n_nodes); return false; } - graph->last_graph_version = cgraph->version; + graph->last_graph_uid = cgraph->uid; // Check if the graph size has changed if ((int)graph->node_props.size() != cgraph->n_nodes) { diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 81e486a84a5..863e471cf05 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -341,7 +341,7 @@ struct ggml_cgraph { enum ggml_cgraph_eval_order order; - uint64_t version; + uint64_t uid; }; // returns a slice of cgraph with nodes [i0, i1) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a6fd0b8adb6..c799572fa63 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6917,7 +6917,6 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten if (n_new > 0) { // the last added node should always be starting point GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); - cgraph->version = ggml_graph_next_version(); } } @@ -7132,7 +7131,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.use_counts =*/ cgraph0->use_counts, /*.visited_hash_set =*/ cgraph0->visited_hash_set, /*.order =*/ cgraph0->order, - /*.version =*/ cgraph0->version, + /*.version =*/ 0 }; return cgraph; @@ -7244,7 +7243,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { cgraph->n_leafs = 0; cgraph->n_nodes = 0; ggml_hash_set_reset(&cgraph->visited_hash_set); - cgraph->version = ggml_graph_next_version(); } int ggml_graph_size(struct ggml_cgraph * cgraph) { From ce409bc78bd6f565379a3218e06700a8d61fdf54 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 13 Apr 2026 11:26:42 +0800 Subject: [PATCH 8/9] fix windows --- ggml/src/ggml.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c799572fa63..3c36e9e68e4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -54,8 +54,13 @@ #define UNUSED GGML_UNUSED uint64_t ggml_graph_next_version(void) { +#ifdef _MSC_VER + static volatile long long counter = 1; + return (uint64_t) _InterlockedIncrement64(&counter) - 1; +#else static uint64_t counter = 1; - return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); // TODO: make portable + return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); +#endif } // Needed for ggml_fp32_to_bf16_row() From 1816cc7dc17bb5edfa1a13fed63070034d16ab91 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 13 Apr 2026 20:42:17 +0800 Subject: [PATCH 9/9] address further review comments --- ggml/include/ggml.h | 2 ++ ggml/src/ggml-backend.cpp | 7 +------ ggml/src/ggml-impl.h | 2 +- ggml/src/ggml.c | 13 ++++++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 11d3e8a8167..563528da4d4 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -232,6 +232,8 @@ #define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_GRAPH_SIZE 2048 +#define GGML_SCHED_MAX_SPLIT_BITS 12 // log2(4096) + #if UINTPTR_MAX == 0xFFFFFFFF #define GGML_MEM_ALIGN 4 #elif defined(__EMSCRIPTEN__) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 34ca7768a5d..b3fef7038af 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -761,11 +761,6 @@ static bool ggml_is_view_op(enum ggml_op op) { #define GGML_SCHED_MAX_COPIES 4 #endif -#ifndef GGML_SCHED_MAX_SPLIT_BITS -#define GGML_SCHED_MAX_SPLIT_BITS 12 // log2(4096) -#endif - - struct ggml_backend_sched_split { int backend_id; int i_start; @@ -1035,7 +1030,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra GGML_ABORT("%s: failed to initialize context\n", __func__); } - graph->uid = ggml_graph_next_version(); + graph->uid = ggml_graph_next_uid(); // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 863e471cf05..e28f8e2e06a 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -30,7 +30,7 @@ extern "C" { void ggml_print_backtrace(void); -uint64_t ggml_graph_next_version(void); +uint64_t ggml_graph_next_uid(void); #ifndef MIN # define MIN(a, b) ((a) < (b) ? (a) : (b)) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3c36e9e68e4..c01ac756b93 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -53,14 +53,17 @@ #define UNUSED GGML_UNUSED -uint64_t ggml_graph_next_version(void) { +uint64_t ggml_graph_next_uid(void) { #ifdef _MSC_VER static volatile long long counter = 1; - return (uint64_t) _InterlockedIncrement64(&counter) - 1; + long long ret = (uint64_t) _InterlockedIncrement64(&counter) - 1; + GGML_ASSERT(ret < (1ULL << (64 - GGML_SCHED_MAX_SPLIT_BITS))); #else static uint64_t counter = 1; - return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); + uint64_t ret = __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); + GGML_ASSERT(ret < (1ULL << (64 - GGML_SCHED_MAX_SPLIT_BITS))); #endif + return ret; } // Needed for ggml_fp32_to_bf16_row() @@ -7108,7 +7111,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.use_counts =*/ use_counts_ptr, /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, - /*.version =*/ 0, + /*.uid =*/ 0, }; ggml_hash_set_reset(&cgraph->visited_hash_set); @@ -7136,7 +7139,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.use_counts =*/ cgraph0->use_counts, /*.visited_hash_set =*/ cgraph0->visited_hash_set, /*.order =*/ cgraph0->order, - /*.version =*/ 0 + /*.uid =*/ 0 }; return cgraph;