Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1319,6 +1319,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.clear_idle = value;
}
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--kv-dynamic"},
"enable dynamic KV cache resizing (start small, grow on demand)",
[](common_params & params) {
params.kv_dynamic = true;
}
));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
Expand Down
1 change: 1 addition & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1476,6 +1476,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
cparams.kv_dynamic = params.kv_dynamic;

cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ struct common_params {
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
bool kv_dynamic = false; // enable dynamic KV cache resizing

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // enable mmap to use filesystem cache
Expand Down
1 change: 1 addition & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ extern "C" {
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
bool kv_dynamic; // enable dynamic KV cache resizing (start small, grow on demand)

// [EXPERIMENTAL]
// backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
Expand Down
29 changes: 29 additions & 0 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,32 @@
#include "llama-batch.h"
#include "llama-io.h"
#include "llama-memory.h"
#include "llama-memory-hybrid.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include "llama-ext.h"

namespace {

// Query-and-reset helper: returns true when the attention KV cache backing
// `memory` reports it was grown since the last call (clearing that flag as a
// side effect), so the caller knows to re-reserve the scheduler graph.
// Returns false for null memory or memory types without a resizable KV cache.
bool check_and_clear_resized_kv(llama_memory_i * memory) {
    if (!memory) {
        return false;
    }

    // plain attention KV cache
    if (auto * kv = dynamic_cast<llama_kv_cache *>(memory)) {
        return kv->check_and_clear_resized();
    }

    // hybrid memory: only its attention part can resize
    if (auto * hybrid = dynamic_cast<llama_memory_hybrid *>(memory)) {
        if (auto * attn = hybrid->get_mem_attn()) {
            return attn->check_and_clear_resized();
        }
    }

    return false;
}

} // namespace

#include <cinttypes>
#include <cmath>
#include <cstring>
Expand Down Expand Up @@ -162,6 +184,7 @@ llama_context::llama_context(

cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
cparams.kv_dynamic = params.kv_dynamic;

// initialized later
cparams.pipeline_parallel = false;
Expand Down Expand Up @@ -1620,6 +1643,11 @@ int llama_context::decode(const llama_batch & batch_inp) {
return -2;
}

if (check_and_clear_resized_kv(memory.get())) {
sched_need_reserve = true;
sched_reserve();
}

switch (mctx->get_status()) {
case LLAMA_MEMORY_STATUS_SUCCESS:
{
Expand Down Expand Up @@ -2912,6 +2940,7 @@ llama_context_params llama_context_default_params() {
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.kv_unified =*/ false,
/*.kv_dynamic =*/ false,
/*.sampler =*/ nullptr,
/*.n_sampler =*/ 0,
};
Expand Down
1 change: 1 addition & 0 deletions src/llama-cparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct llama_cparams {
bool warmup;
bool op_offload;
bool kv_unified;
bool kv_dynamic;
bool pipeline_parallel;

enum llama_pooling_type pooling_type;
Expand Down
133 changes: 132 additions & 1 deletion src/llama-kv-cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,34 @@ llama_kv_cache::llama_kv_cache(
uint32_t n_swa,
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse) :
const layer_reuse_cb & reuse,
uint32_t kv_size_max) :
model(model), hparams(model.hparams), v_trans(v_trans),
n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {

// save construction parameters for dynamic resize
saved_type_k = type_k;
saved_type_v = type_v;
saved_v_trans = v_trans;
saved_offload = offload;
saved_unified = unified;
saved_n_seq_max = n_seq_max;
saved_n_pad = n_pad;
saved_n_swa = n_swa;
saved_swa_type = swa_type;
saved_filter = filter;
saved_reuse = reuse;
kv_size_max_val = kv_size_max;

// dynamic resize: start with small initial size
if (kv_size_max > 0 && kv_size > 256) {
kv_size_cur = 256;
kv_size = kv_size_cur;
LLAMA_LOG_INFO("%s: dynamic KV cache: start = %u cells, max = %u cells\n", __func__, kv_size_cur, kv_size_max);
} else {
kv_size_cur = kv_size;
}

GGML_ASSERT(kv_size % n_pad == 0);

const uint32_t n_layer_kv = hparams.n_layer_kv();
Expand Down Expand Up @@ -254,6 +278,8 @@ llama_kv_cache::llama_kv_cache(

LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);

// Keep unwritten padding deterministic; dynamic mode still saves memory
// because the buffer starts small and only grows when needed.
ggml_backend_buffer_clear(buf, 0);
ctxs_bufs.emplace_back(std::move(ctx), buf);
}
Expand Down Expand Up @@ -637,6 +663,12 @@ llama_memory_context_ptr llama_kv_cache::init_batch(
}

auto sinfos = prepare(ubatches);
while (sinfos.empty()) {
if (!try_resize()) {
break;
}
sinfos = prepare(ubatches);
}
if (sinfos.empty()) {
break;
}
Expand Down Expand Up @@ -1113,6 +1145,105 @@ ggml_type llama_kv_cache::type_v() const {
return layers[0].v->type;
}

bool llama_kv_cache::try_resize() {
    // dynamic resizing disabled
    if (kv_size_max_val == 0) {
        return false;
    }

    // the constructor asserts kv_size % n_pad == 0, so every size we grow to
    // must be a multiple of n_pad; round the configured cap down once so the
    // clamp below can never produce an unpadded (or over-cap) size
    const uint32_t size_cap = (kv_size_max_val / n_pad) * n_pad;

    // already at max capacity?
    if (kv_size_cur >= size_cap) {
        return false;
    }

    // growth policy: double while small, then grow by ~1 GiB worth of cells
    uint32_t new_size;
    if (kv_size_cur < 4096) {
        new_size = kv_size_cur * 2;
    } else {
        // estimate cells-per-GiB from the current per-cell memory footprint
        const size_t total    = total_size();
        const size_t per_cell = total / kv_size_cur;
        const uint32_t cells_per_gb = (per_cell > 0) ? (uint32_t)(1024ULL * 1024 * 1024 / per_cell) : kv_size_cur;
        new_size = kv_size_cur + std::max(cells_per_gb, 256u);
    }

    // align up to the padding granularity FIRST, then clamp to the padded cap,
    // so the final size both satisfies the n_pad invariant and never exceeds
    // the configured maximum
    new_size = ((new_size + n_pad - 1) / n_pad) * n_pad;
    new_size = std::min(new_size, size_cap);

    if (new_size <= kv_size_cur) {
        return false;
    }

    // log the size actually allocated (post-alignment, post-clamp)
    LLAMA_LOG_INFO("%s: resizing KV cache from %u to %u cells\n", __func__, kv_size_cur, new_size);

    // create a temporary cache with the new size
    // NOTE: pass kv_size_max=0 so the constructor does NOT apply
    // the dynamic start logic (which would shrink back to the small start size)
    llama_kv_cache tmp(model, saved_type_k, saved_type_v, saved_v_trans, saved_offload, saved_unified, new_size,
                       saved_n_seq_max, saved_n_pad, saved_n_swa, saved_swa_type, saved_filter, saved_reuse,
                       /*kv_size_max=*/0);

    // copy existing tensor data and cell metadata into the replacement
    tmp.copy_from(*this);

    // steal the internals from the new cache
    ctxs_bufs = std::move(tmp.ctxs_bufs);
    layers    = std::move(tmp.layers);
    v_cells   = std::move(tmp.v_cells);
    v_heads   = std::move(tmp.v_heads);

    // the metadata copy assigned our old (smaller) cell bookkeeping wholesale,
    // so grow it back up to the new capacity
    for (uint32_t s = 0; s < n_stream; ++s) {
        v_cells[s].resize(new_size);
    }

    kv_size_cur = new_size;
    was_resized = true; // observed via check_and_clear_resized() to trigger a sched re-reserve

    return true;
}

void llama_kv_cache::copy_from(const llama_kv_cache & other) {
GGML_ASSERT(layers.size() == other.layers.size());

for (size_t il = 0; il < layers.size(); ++il) {
if (layers[il].k && other.layers[il].k) {
GGML_ASSERT(layers[il].k_stream.size() == other.layers[il].k_stream.size());
for (size_t s = 0; s < other.layers[il].k_stream.size(); ++s) {
const size_t n_bytes = ggml_nbytes(other.layers[il].k_stream[s]);
std::vector<uint8_t> staging(n_bytes);
ggml_backend_tensor_get(other.layers[il].k_stream[s], staging.data(), 0, n_bytes);
ggml_backend_tensor_set(layers[il].k_stream[s], staging.data(), 0, n_bytes);
}
}
if (layers[il].v && other.layers[il].v) {
GGML_ASSERT(layers[il].v_stream.size() == other.layers[il].v_stream.size());
for (size_t s = 0; s < other.layers[il].v_stream.size(); ++s) {
const size_t n_bytes = ggml_nbytes(other.layers[il].v_stream[s]);
std::vector<uint8_t> staging(n_bytes);
ggml_backend_tensor_get(other.layers[il].v_stream[s], staging.data(), 0, n_bytes);
ggml_backend_tensor_set(layers[il].v_stream[s], staging.data(), 0, n_bytes);
}
}
}

// copy cell metadata
for (uint32_t s = 0; s < n_stream && s < (uint32_t) other.v_cells.size(); ++s) {
v_cells[s] = other.v_cells[s];
}
}

// Returns whether the cache was resized since the last call, resetting the
// flag so each resize is reported exactly once.
bool llama_kv_cache::check_and_clear_resized() {
    const bool res = was_resized;
    was_resized = false;
    return res;
}

uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
uint32_t result = 0;

Expand Down
26 changes: 25 additions & 1 deletion src/llama-kv-cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ class llama_kv_cache : public llama_memory_i {
uint32_t n_swa,
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
const layer_reuse_cb & reuse,
uint32_t kv_size_max = 0);

~llama_kv_cache() = default;

Expand Down Expand Up @@ -155,6 +156,11 @@ class llama_kv_cache : public llama_memory_i {
ggml_type type_k() const;
ggml_type type_v() const;

// dynamic resize support
bool try_resize();
void copy_from(const llama_kv_cache & other);
bool check_and_clear_resized();

//
// graph_build API
//
Expand Down Expand Up @@ -265,6 +271,24 @@ class llama_kv_cache : public llama_memory_i {

std::vector<kv_layer> layers;

// dynamic resize state
uint32_t kv_size_cur = 0;
uint32_t kv_size_max_val = 0;
bool was_resized = false;

// saved construction parameters (for resize)
ggml_type saved_type_k = GGML_TYPE_F16;
ggml_type saved_type_v = GGML_TYPE_F16;
bool saved_v_trans = true;
bool saved_offload = true;
bool saved_unified = false;
uint32_t saved_n_seq_max = 1;
uint32_t saved_n_pad = 1;
uint32_t saved_n_swa = 0;
llama_swa_type saved_swa_type = LLAMA_SWA_TYPE_NONE;
layer_filter_cb saved_filter = nullptr;
layer_reuse_cb saved_reuse = nullptr;

// model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids;

Expand Down
13 changes: 11 additions & 2 deletions src/llama-memory-hybrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ llama_memory_hybrid::llama_memory_hybrid(
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn,
const layer_filter_cb & filter_recr) :
const layer_filter_cb & filter_recr,
/* dynamic resize */
uint32_t kv_size_max) :
hparams(model.hparams),
mem_attn(new llama_kv_cache(
model,
Expand All @@ -45,7 +47,8 @@ llama_memory_hybrid::llama_memory_hybrid(
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
nullptr
nullptr,
kv_size_max
)),
mem_recr(new llama_memory_recurrent(
model,
Expand Down Expand Up @@ -99,6 +102,12 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba

// prepare the attention cache
auto heads_attn = mem_attn->prepare(ubatches);
while (heads_attn.empty()) {
if (!mem_attn->try_resize()) {
break;
}
heads_attn = mem_attn->prepare(ubatches);
}
if (heads_attn.empty()) {
LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
Expand Down
4 changes: 3 additions & 1 deletion src/llama-memory-hybrid.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ class llama_memory_hybrid : public llama_memory_i {
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn = nullptr,
const layer_filter_cb & filter_recr = nullptr);
const layer_filter_cb & filter_recr = nullptr,
/* dynamic resize */
uint32_t kv_size_max = 0);

~llama_memory_hybrid() = default;

Expand Down
6 changes: 4 additions & 2 deletions src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8348,7 +8348,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
/* offload */ cparams.offload_kqv,
/* unified */ cparams.kv_unified,
/* filter_attn */ std::move(filter_attn),
/* filter_recr */ std::move(filter_recr));
/* filter_recr */ std::move(filter_recr),
/* kv_size_max */ cparams.kv_dynamic ? cparams.n_ctx_seq : 0);
}
} else {
llama_memory_i::layer_reuse_cb reuse = nullptr;
Expand Down Expand Up @@ -8396,7 +8397,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
hparams.n_swa,
hparams.swa_type,
nullptr,
nullptr);
nullptr,
cparams.kv_dynamic ? cparams.n_ctx_seq : 0);
}
}
}
Expand Down