From e328e2e7ec37ced181ac0b59755c6126d91bd056 Mon Sep 17 00:00:00 2001 From: anavoi Date: Sat, 11 Apr 2026 00:16:09 +0200 Subject: [PATCH] server: ensure prompt caching for SWA models When using --swa-full with Gemma 4 and other SWA models, the server can sometimes incorrectly force full prompt re-processing on every request, even though --swa-full allocates a full-size SWA cache where tokens are not pruned. Fix by: - Setting pos_min_thold = 0 when swa_full is enabled (any cached position is useful) - Skipping checkpoint restoration logic when swa_full is enabled (unnecessary since cache is full-size) --- tools/server/server-context.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b31981c5628..d3415450881 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2344,7 +2344,7 @@ struct server_context_impl { const auto n_swa = std::max(0, llama_model_n_swa(model)); // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, pos_next - n_swa); + const auto pos_min_thold = params_base.swa_full ? 0 : std::max(0, pos_next - n_swa); if (n_past > 0 && n_past < slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); @@ -2396,7 +2396,7 @@ struct server_context_impl { SLT_WRN(slot, "%s\n", st1.str().c_str()); } - if (pos_min >= pos_min_thold) { + if (pos_min >= pos_min_thold && !params_base.swa_full) { SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa); // search for a context checkpoint