Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1319,6 +1319,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.clear_idle = value;
}
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--kv-dynamic"},
"enable dynamic KV cache resizing (start small, grow on demand)",
[](common_params & params) {
params.kv_dynamic = true;
}
));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
Expand Down
1 change: 1 addition & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1476,6 +1476,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
cparams.kv_dynamic = params.kv_dynamic;

cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ struct common_params {
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
bool kv_dynamic = false; // enable dynamic KV cache resizing

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // enable mmap to use filesystem cache
Expand Down
1 change: 1 addition & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ extern "C" {
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
bool kv_dynamic; // enable dynamic KV cache resizing (start small, grow on demand)

// [EXPERIMENTAL]
// backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
Expand Down
29 changes: 29 additions & 0 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,32 @@
#include "llama-batch.h"
#include "llama-io.h"
#include "llama-memory.h"
#include "llama-memory-hybrid.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include "llama-ext.h"

namespace {

// Query-and-reset helper: returns true when the attention KV cache backing
// `memory` reports it was grown since the last call (clearing that flag as a
// side effect), so the caller knows to re-reserve the scheduler graph.
// Returns false for null memory or memory types without a resizable KV cache.
bool check_and_clear_resized_kv(llama_memory_i * memory) {
    if (!memory) {
        return false;
    }

    // plain attention KV cache
    if (auto * kv = dynamic_cast<llama_kv_cache *>(memory)) {
        return kv->check_and_clear_resized();
    }

    // hybrid memory: only its attention part can resize
    if (auto * hybrid = dynamic_cast<llama_memory_hybrid *>(memory)) {
        if (auto * attn = hybrid->get_mem_attn()) {
            return attn->check_and_clear_resized();
        }
    }

    return false;
}

} // namespace

#include <cinttypes>
#include <cmath>
#include <cstring>
Expand Down Expand Up @@ -162,6 +184,7 @@ llama_context::llama_context(

cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
cparams.kv_dynamic = params.kv_dynamic;

// initialized later
cparams.pipeline_parallel = false;
Expand Down Expand Up @@ -1620,6 +1643,11 @@ int llama_context::decode(const llama_batch & batch_inp) {
return -2;
}

if (check_and_clear_resized_kv(memory.get())) {
sched_need_reserve = true;
sched_reserve();
}

switch (mctx->get_status()) {
case LLAMA_MEMORY_STATUS_SUCCESS:
{
Expand Down Expand Up @@ -2912,6 +2940,7 @@ llama_context_params llama_context_default_params() {
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.kv_unified =*/ false,
/*.kv_dynamic =*/ false,
/*.sampler =*/ nullptr,
/*.n_sampler =*/ 0,
};
Expand Down
1 change: 1 addition & 0 deletions src/llama-cparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct llama_cparams {
bool warmup;
bool op_offload;
bool kv_unified;
bool kv_dynamic;
bool pipeline_parallel;

enum llama_pooling_type pooling_type;
Expand Down
133 changes: 132 additions & 1 deletion src/llama-kv-cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,34 @@ llama_kv_cache::llama_kv_cache(
uint32_t n_swa,
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse) :
const layer_reuse_cb & reuse,
uint32_t kv_size_max) :
model(model), hparams(model.hparams), v_trans(v_trans),
n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {

// save construction parameters for dynamic resize
saved_type_k = type_k;
saved_type_v = type_v;
saved_v_trans = v_trans;
saved_offload = offload;
saved_unified = unified;
saved_n_seq_max = n_seq_max;
saved_n_pad = n_pad;
saved_n_swa = n_swa;
saved_swa_type = swa_type;
saved_filter = filter;
saved_reuse = reuse;
kv_size_max_val = kv_size_max;

// dynamic resize: start with small initial size
if (kv_size_max > 0 && kv_size > 256) {
kv_size_cur = 256;
kv_size = kv_size_cur;
LLAMA_LOG_INFO("%s: dynamic KV cache: start = %u cells, max = %u cells\n", __func__, kv_size_cur, kv_size_max);
} else {
kv_size_cur = kv_size;
}

GGML_ASSERT(kv_size % n_pad == 0);

const uint32_t n_layer_kv = hparams.n_layer_kv();
Expand Down Expand Up @@ -254,6 +278,8 @@ llama_kv_cache::llama_kv_cache(

LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);

// Keep unwritten padding deterministic; dynamic mode still saves memory
// because the buffer starts small and only grows when needed.
ggml_backend_buffer_clear(buf, 0);
ctxs_bufs.emplace_back(std::move(ctx), buf);
}
Expand Down Expand Up @@ -637,6 +663,12 @@ llama_memory_context_ptr llama_kv_cache::init_batch(
}

auto sinfos = prepare(ubatches);
while (sinfos.empty()) {
if (!try_resize()) {
break;
}
sinfos = prepare(ubatches);
}
if (sinfos.empty()) {
break;
}
Expand Down Expand Up @@ -1113,6 +1145,105 @@ ggml_type llama_kv_cache::type_v() const {
return layers[0].v->type;
}

bool llama_kv_cache::try_resize() {
    // dynamic resizing disabled
    if (kv_size_max_val == 0) {
        return false;
    }

    // the constructor asserts kv_size % n_pad == 0, so every size we grow to
    // must be a multiple of n_pad; round the configured cap down once so the
    // clamp below can never produce an unpadded (or over-cap) size
    const uint32_t size_cap = (kv_size_max_val / n_pad) * n_pad;

    // already at max capacity?
    if (kv_size_cur >= size_cap) {
        return false;
    }

    // growth policy: double while small, then grow by ~1 GiB worth of cells
    uint32_t new_size;
    if (kv_size_cur < 4096) {
        new_size = kv_size_cur * 2;
    } else {
        // estimate cells-per-GiB from the current per-cell memory footprint
        const size_t total    = total_size();
        const size_t per_cell = total / kv_size_cur;
        const uint32_t cells_per_gb = (per_cell > 0) ? (uint32_t)(1024ULL * 1024 * 1024 / per_cell) : kv_size_cur;
        new_size = kv_size_cur + std::max(cells_per_gb, 256u);
    }

    // align up to the padding granularity FIRST, then clamp to the padded cap,
    // so the final size both satisfies the n_pad invariant and never exceeds
    // the configured maximum
    new_size = ((new_size + n_pad - 1) / n_pad) * n_pad;
    new_size = std::min(new_size, size_cap);

    if (new_size <= kv_size_cur) {
        return false;
    }

    // log the size actually allocated (post-alignment, post-clamp)
    LLAMA_LOG_INFO("%s: resizing KV cache from %u to %u cells\n", __func__, kv_size_cur, new_size);

    // create a temporary cache with the new size
    // NOTE: pass kv_size_max=0 so the constructor does NOT apply
    // the dynamic start logic (which would shrink back to the small start size)
    llama_kv_cache tmp(model, saved_type_k, saved_type_v, saved_v_trans, saved_offload, saved_unified, new_size,
                       saved_n_seq_max, saved_n_pad, saved_n_swa, saved_swa_type, saved_filter, saved_reuse,
                       /*kv_size_max=*/0);

    // copy existing tensor data and cell metadata into the replacement
    tmp.copy_from(*this);

    // steal the internals from the new cache
    ctxs_bufs = std::move(tmp.ctxs_bufs);
    layers    = std::move(tmp.layers);
    v_cells   = std::move(tmp.v_cells);
    v_heads   = std::move(tmp.v_heads);

    // the metadata copy assigned our old (smaller) cell bookkeeping wholesale,
    // so grow it back up to the new capacity
    for (uint32_t s = 0; s < n_stream; ++s) {
        v_cells[s].resize(new_size);
    }

    kv_size_cur = new_size;
    was_resized = true; // observed via check_and_clear_resized() to trigger a sched re-reserve

    return true;
}

void llama_kv_cache::copy_from(const llama_kv_cache & other) {
GGML_ASSERT(layers.size() == other.layers.size());

for (size_t il = 0; il < layers.size(); ++il) {
if (layers[il].k && other.layers[il].k) {
GGML_ASSERT(layers[il].k_stream.size() == other.layers[il].k_stream.size());
for (size_t s = 0; s < other.layers[il].k_stream.size(); ++s) {
const size_t n_bytes = ggml_nbytes(other.layers[il].k_stream[s]);
std::vector<uint8_t> staging(n_bytes);
ggml_backend_tensor_get(other.layers[il].k_stream[s], staging.data(), 0, n_bytes);
ggml_backend_tensor_set(layers[il].k_stream[s], staging.data(), 0, n_bytes);
}
}
if (layers[il].v && other.layers[il].v) {
GGML_ASSERT(layers[il].v_stream.size() == other.layers[il].v_stream.size());
for (size_t s = 0; s < other.layers[il].v_stream.size(); ++s) {
const size_t n_bytes = ggml_nbytes(other.layers[il].v_stream[s]);
std::vector<uint8_t> staging(n_bytes);
ggml_backend_tensor_get(other.layers[il].v_stream[s], staging.data(), 0, n_bytes);
ggml_backend_tensor_set(layers[il].v_stream[s], staging.data(), 0, n_bytes);
}
}
}

// copy cell metadata
for (uint32_t s = 0; s < n_stream && s < (uint32_t) other.v_cells.size(); ++s) {
v_cells[s] = other.v_cells[s];
}
}

// Returns whether the cache was resized since the last call, resetting the
// flag so each resize is reported exactly once.
bool llama_kv_cache::check_and_clear_resized() {
    const bool res = was_resized;
    was_resized = false;
    return res;
}

uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
uint32_t result = 0;

Expand Down
26 changes: 25 additions & 1 deletion src/llama-kv-cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ class llama_kv_cache : public llama_memory_i {
uint32_t n_swa,
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
const layer_reuse_cb & reuse,
uint32_t kv_size_max = 0);

~llama_kv_cache() = default;

Expand Down Expand Up @@ -155,6 +156,11 @@ class llama_kv_cache : public llama_memory_i {
ggml_type type_k() const;
ggml_type type_v() const;

// dynamic resize support
bool try_resize();
void copy_from(const llama_kv_cache & other);
bool check_and_clear_resized();

//
// graph_build API
//
Expand Down Expand Up @@ -265,6 +271,24 @@ class llama_kv_cache : public llama_memory_i {

std::vector<kv_layer> layers;

// dynamic resize state
uint32_t kv_size_cur = 0;
uint32_t kv_size_max_val = 0;
bool was_resized = false;

// saved construction parameters (for resize)
ggml_type saved_type_k = GGML_TYPE_F16;
ggml_type saved_type_v = GGML_TYPE_F16;
bool saved_v_trans = true;
bool saved_offload = true;
bool saved_unified = false;
uint32_t saved_n_seq_max = 1;
uint32_t saved_n_pad = 1;
uint32_t saved_n_swa = 0;
llama_swa_type saved_swa_type = LLAMA_SWA_TYPE_NONE;
layer_filter_cb saved_filter = nullptr;
layer_reuse_cb saved_reuse = nullptr;

// model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids;

Expand Down
13 changes: 11 additions & 2 deletions src/llama-memory-hybrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ llama_memory_hybrid::llama_memory_hybrid(
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn,
const layer_filter_cb & filter_recr) :
const layer_filter_cb & filter_recr,
/* dynamic resize */
uint32_t kv_size_max) :
hparams(model.hparams),
mem_attn(new llama_kv_cache(
model,
Expand All @@ -45,7 +47,8 @@ llama_memory_hybrid::llama_memory_hybrid(
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
nullptr
nullptr,
kv_size_max
)),
mem_recr(new llama_memory_recurrent(
model,
Expand Down Expand Up @@ -99,6 +102,12 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba

// prepare the attention cache
auto heads_attn = mem_attn->prepare(ubatches);
while (heads_attn.empty()) {
if (!mem_attn->try_resize()) {
break;
}
heads_attn = mem_attn->prepare(ubatches);
}
if (heads_attn.empty()) {
LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
Expand Down
4 changes: 3 additions & 1 deletion src/llama-memory-hybrid.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ class llama_memory_hybrid : public llama_memory_i {
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn = nullptr,
const layer_filter_cb & filter_recr = nullptr);
const layer_filter_cb & filter_recr = nullptr,
/* dynamic resize */
uint32_t kv_size_max = 0);

~llama_memory_hybrid() = default;

Expand Down
6 changes: 4 additions & 2 deletions src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8348,7 +8348,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
/* offload */ cparams.offload_kqv,
/* unified */ cparams.kv_unified,
/* filter_attn */ std::move(filter_attn),
/* filter_recr */ std::move(filter_recr));
/* filter_recr */ std::move(filter_recr),
/* kv_size_max */ cparams.kv_dynamic ? cparams.n_ctx_seq : 0);
}
} else {
llama_memory_i::layer_reuse_cb reuse = nullptr;
Expand Down Expand Up @@ -8396,7 +8397,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
hparams.n_swa,
hparams.swa_type,
nullptr,
nullptr);
nullptr,
cparams.kv_dynamic ? cparams.n_ctx_seq : 0);
}
}
}
Expand Down