diff --git a/common/arg.cpp b/common/arg.cpp index 2e0f46db519..155ac9d9ed5 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1319,6 +1319,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.clear_idle = value; } ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--kv-dynamic"}, + "enable dynamic KV cache resizing (start small, grow on demand)", + [](common_params & params) { + params.kv_dynamic = true; + } + )); add_opt(common_arg( {"--context-shift"}, {"--no-context-shift"}, diff --git a/common/common.cpp b/common/common.cpp index 16f78debd02..6d73d9c80d5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1476,6 +1476,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.op_offload = !params.no_op_offload; cparams.swa_full = params.swa_full; cparams.kv_unified = params.kv_unified; + cparams.kv_dynamic = params.kv_dynamic; cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; diff --git a/common/common.h b/common/common.h index 020b6a721ff..cf7831037b1 100644 --- a/common/common.h +++ b/common/common.h @@ -530,6 +530,7 @@ struct common_params { bool ctx_shift = false; // context shift on infinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) bool kv_unified = false; // enable unified KV cache + bool kv_dynamic = false; // enable dynamic KV cache resizing bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool use_mmap = true; // enable mmap to use filesystem cache diff --git a/include/llama.h b/include/llama.h index a940f9d648a..8893eee0f73 100644 --- a/include/llama.h +++ b/include/llama.h @@ -372,6 +372,7 @@ extern "C" { bool kv_unified; // use a unified buffer across the input sequences when computing the attention // try to disable when n_seq_max > 1 for improved performance 
when the sequences do not share a large prefix // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + bool kv_dynamic; // enable dynamic KV cache resizing (start small, grow on demand) // [EXPERIMENTAL] // backend sampler chain configuration (make sure the caller keeps the sampler chains alive) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a808e3e4542..94c2766140e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -5,10 +5,32 @@ #include "llama-batch.h" #include "llama-io.h" #include "llama-memory.h" +#include "llama-memory-hybrid.h" #include "llama-mmap.h" #include "llama-model.h" #include "llama-ext.h" +namespace { + +bool check_and_clear_resized_kv(llama_memory_i * memory) { + if (memory == nullptr) { + return false; + } + + if (auto * kv = dynamic_cast<llama_kv_cache *>(memory)) { + return kv->check_and_clear_resized(); + } + + if (auto * hybrid = dynamic_cast<llama_memory_hybrid *>(memory)) { + auto * mem_attn = hybrid->get_mem_attn(); + return mem_attn != nullptr && mem_attn->check_and_clear_resized(); + } + + return false; +} + +} // namespace + #include <cstring> #include <limits> #include <stdexcept> @@ -162,6 +184,7 @@ llama_context::llama_context( cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; + cparams.kv_dynamic = params.kv_dynamic; // initialized later cparams.pipeline_parallel = false; @@ -1620,6 +1643,11 @@ int llama_context::decode(const llama_batch & batch_inp) { return -2; } + if (check_and_clear_resized_kv(memory.get())) { + sched_need_reserve = true; + sched_reserve(); + } + switch (mctx->get_status()) { case LLAMA_MEMORY_STATUS_SUCCESS: { @@ -2912,6 +2940,7 @@ llama_context_params llama_context_default_params() { /*.op_offload =*/ true, /*.swa_full =*/ true, /*.kv_unified =*/ false, + /*.kv_dynamic =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, }; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 9d359474132..39493ccf2c5 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -38,6 +38,7 @@ struct llama_cparams { bool
warmup; bool op_offload; bool kv_unified; + bool kv_dynamic; bool pipeline_parallel; enum llama_pooling_type pooling_type; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 3e0fd3107f3..6dc9462327b 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -89,10 +89,34 @@ llama_kv_cache::llama_kv_cache( uint32_t n_swa, llama_swa_type swa_type, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : + const layer_reuse_cb & reuse, + uint32_t kv_size_max) : model(model), hparams(model.hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { + // save construction parameters for dynamic resize + saved_type_k = type_k; + saved_type_v = type_v; + saved_v_trans = v_trans; + saved_offload = offload; + saved_unified = unified; + saved_n_seq_max = n_seq_max; + saved_n_pad = n_pad; + saved_n_swa = n_swa; + saved_swa_type = swa_type; + saved_filter = filter; + saved_reuse = reuse; + kv_size_max_val = kv_size_max; + + // dynamic resize: start with small initial size + if (kv_size_max > 0 && kv_size > 256) { + kv_size_cur = 256; + kv_size = kv_size_cur; + LLAMA_LOG_INFO("%s: dynamic KV cache: start = %u cells, max = %u cells\n", __func__, kv_size_cur, kv_size_max); + } else { + kv_size_cur = kv_size; + } + GGML_ASSERT(kv_size % n_pad == 0); const uint32_t n_layer_kv = hparams.n_layer_kv(); @@ -254,6 +278,8 @@ llama_kv_cache::llama_kv_cache( LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + // Keep unwritten padding deterministic; dynamic mode still saves memory + // because the buffer starts small and only grows when needed. 
ggml_backend_buffer_clear(buf, 0); ctxs_bufs.emplace_back(std::move(ctx), buf); } @@ -637,6 +663,12 @@ llama_memory_context_ptr llama_kv_cache::init_batch( } auto sinfos = prepare(ubatches); + while (sinfos.empty()) { + if (!try_resize()) { + break; + } + sinfos = prepare(ubatches); + } if (sinfos.empty()) { break; } @@ -1113,6 +1145,105 @@ ggml_type llama_kv_cache::type_v() const { return layers[0].v->type; } +bool llama_kv_cache::try_resize() { + if (kv_size_max_val == 0) { + return false; + } + + // already at max capacity? + if (kv_size_cur >= kv_size_max_val) { + return false; + } + + // calculate new size: double for small, +1GB equivalent for large + uint32_t new_size; + if (kv_size_cur < 4096) { + new_size = kv_size_cur * 2; + } else { + // estimate ~1GB growth based on per-cell memory + const size_t total = total_size(); + const size_t per_cell = total / kv_size_cur; + const uint32_t cells_per_gb = (per_cell > 0) ? (uint32_t)(1024ULL * 1024 * 1024 / per_cell) : kv_size_cur; + new_size = kv_size_cur + std::max(cells_per_gb, 256u); + } + + // align to n_pad BEFORE clamping so the final size never exceeds the max + new_size = ((new_size + n_pad - 1) / n_pad) * n_pad; + + // clamp to max, rounded down to n_pad to keep the constructor's kv_size % n_pad == 0 assert satisfied + new_size = std::min(new_size, (kv_size_max_val / n_pad) * n_pad); + + if (new_size <= kv_size_cur) { + return false; + } + + LLAMA_LOG_INFO("%s: resizing KV cache from %u to %u cells\n", __func__, kv_size_cur, new_size); + + // create a temporary cache with the new size + // NOTE: pass kv_size_max=0 so the constructor does NOT apply + // the dynamic start logic (which would shrink back to 256) + llama_kv_cache tmp(model, saved_type_k, saved_type_v, saved_v_trans, saved_offload, saved_unified, new_size, + saved_n_seq_max, saved_n_pad, saved_n_swa, saved_swa_type, saved_filter, saved_reuse, + /*kv_size_max=*/0); + + // copy existing data + tmp.copy_from(*this); + + // steal the internals from the new cache + ctxs_bufs = std::move(tmp.ctxs_bufs); + layers = std::move(tmp.layers); + v_cells = std::move(tmp.v_cells); + v_heads =
std::move(tmp.v_heads); + + // resize v_cells to match new size + for (uint32_t s = 0; s < n_stream; ++s) { + v_cells[s].resize(new_size); + } + + kv_size_cur = new_size; + was_resized = true; + + return true; +} + +void llama_kv_cache::copy_from(const llama_kv_cache & other) { + GGML_ASSERT(layers.size() == other.layers.size()); + + for (size_t il = 0; il < layers.size(); ++il) { + if (layers[il].k && other.layers[il].k) { + GGML_ASSERT(layers[il].k_stream.size() == other.layers[il].k_stream.size()); + for (size_t s = 0; s < other.layers[il].k_stream.size(); ++s) { + const size_t n_bytes = ggml_nbytes(other.layers[il].k_stream[s]); + std::vector<uint8_t> staging(n_bytes); + ggml_backend_tensor_get(other.layers[il].k_stream[s], staging.data(), 0, n_bytes); + ggml_backend_tensor_set(layers[il].k_stream[s], staging.data(), 0, n_bytes); + } + } + if (layers[il].v && other.layers[il].v) { + GGML_ASSERT(layers[il].v_stream.size() == other.layers[il].v_stream.size()); + for (size_t s = 0; s < other.layers[il].v_stream.size(); ++s) { + const size_t n_bytes = ggml_nbytes(other.layers[il].v_stream[s]); + std::vector<uint8_t> staging(n_bytes); + ggml_backend_tensor_get(other.layers[il].v_stream[s], staging.data(), 0, n_bytes); + ggml_backend_tensor_set(layers[il].v_stream[s], staging.data(), 0, n_bytes); + } + } + } + + // copy cell metadata + for (uint32_t s = 0; s < n_stream && s < (uint32_t) other.v_cells.size(); ++s) { + v_cells[s] = other.v_cells[s]; + } +} + +bool llama_kv_cache::check_and_clear_resized() { + if (was_resized) { + was_resized = false; + return true; + } + return false; +} + uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const { uint32_t result = 0; diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index d4569a06f71..ff450ea75b9 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -106,7 +106,8 @@ class llama_kv_cache : public llama_memory_i { uint32_t n_swa, llama_swa_type swa_type, const layer_filter_cb & filter, - const layer_reuse_cb &
reuse); + const layer_reuse_cb & reuse, + uint32_t kv_size_max = 0); ~llama_kv_cache() = default; @@ -155,6 +156,11 @@ class llama_kv_cache : public llama_memory_i { ggml_type type_k() const; ggml_type type_v() const; + // dynamic resize support + bool try_resize(); + void copy_from(const llama_kv_cache & other); + bool check_and_clear_resized(); + // // graph_build API // @@ -265,6 +271,24 @@ class llama_kv_cache : public llama_memory_i { std::vector<kv_layer> layers; + // dynamic resize state + uint32_t kv_size_cur = 0; + uint32_t kv_size_max_val = 0; + bool was_resized = false; + + // saved construction parameters (for resize) + ggml_type saved_type_k = GGML_TYPE_F16; + ggml_type saved_type_v = GGML_TYPE_F16; + bool saved_v_trans = true; + bool saved_offload = true; + bool saved_unified = false; + uint32_t saved_n_seq_max = 1; + uint32_t saved_n_pad = 1; + uint32_t saved_n_swa = 0; + llama_swa_type saved_swa_type = LLAMA_SWA_TYPE_NONE; + layer_filter_cb saved_filter = nullptr; + layer_reuse_cb saved_reuse = nullptr; + // model layer id -> KV cache layer id std::unordered_map<int32_t, int32_t> map_layer_ids; diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 4ce1af592c1..99aa97e30f4 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -28,7 +28,9 @@ llama_memory_hybrid::llama_memory_hybrid( bool unified, /* layer filters */ const layer_filter_cb & filter_attn, - const layer_filter_cb & filter_recr) : + const layer_filter_cb & filter_recr, + /* dynamic resize */ + uint32_t kv_size_max) : hparams(model.hparams), mem_attn(new llama_kv_cache( model, @@ -45,7 +47,8 @@ filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); } : filter_attn, - nullptr + nullptr, + kv_size_max )), mem_recr(new llama_memory_recurrent( model, @@ -99,6 +102,12 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba // prepare the attention cache auto heads_attn = mem_attn->prepare(ubatches); + while (heads_attn.empty()) { + if (!mem_attn->try_resize()) { + break; + } + heads_attn = mem_attn->prepare(ubatches); + } if (heads_attn.empty()) { LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__); return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } diff --git a/src/llama-memory-hybrid.h b/src/llama-memory-hybrid.h index 558cafdf984..fb0a9c74e5d 100644 --- a/src/llama-memory-hybrid.h +++ b/src/llama-memory-hybrid.h @@ -38,7 +38,9 @@ class llama_memory_hybrid : public llama_memory_i { bool unified, /* layer filters */ const layer_filter_cb & filter_attn = nullptr, - const layer_filter_cb & filter_recr = nullptr); + const layer_filter_cb & filter_recr = nullptr, + /* dynamic resize */ + uint32_t kv_size_max = 0); ~llama_memory_hybrid() = default; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 79d08ff41e3..44b25f44103 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -8348,7 +8348,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* offload */ cparams.offload_kqv, /* unified */ cparams.kv_unified, /* filter_attn */ std::move(filter_attn), - /* filter_recr */ std::move(filter_recr)); + /* filter_recr */ std::move(filter_recr), + /* kv_size_max */ cparams.kv_dynamic ? cparams.n_ctx_seq : 0); } } else { llama_memory_i::layer_reuse_cb reuse = nullptr; @@ -8396,7 +8397,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, hparams.n_swa, hparams.swa_type, nullptr, - nullptr); + nullptr, + cparams.kv_dynamic ? cparams.n_ctx_seq : 0); } } }