From d1aca48b6496122c72c9a47139afeda63615c030 Mon Sep 17 00:00:00 2001
From: Nearly There
 <188969017+yychyo@users.noreply.github.com>
Date: Fri, 10 Apr 2026 22:36:19 +0300
Subject: [PATCH] server: rename --clear-idle to --cache-idle-slots
---
common/arg.cpp | 8 ++++----
common/common.h | 2 +-
tools/server/README.md | 2 +-
tools/server/server-context.cpp | 12 ++++++------
tools/server/tests/unit/test_kv_keep_only_active.py | 2 +-
tools/server/tests/utils.py | 6 +++---
6 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/common/arg.cpp b/common/arg.cpp
index c0cc576f29c..f5ee910a537 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1312,13 +1312,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
- {"--clear-idle"},
- {"--no-clear-idle"},
+ {"--cache-idle-slots"},
+ {"--no-cache-idle-slots"},
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
[](common_params & params, bool value) {
- params.clear_idle = value;
+ params.cache_idle_slots = value;
}
- ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
diff --git a/common/common.h b/common/common.h
index 020b6a721ff..026ff9dcf9d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -579,7 +579,7 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
bool cache_prompt = true; // whether to enable prompt caching
- bool clear_idle = true; // save and clear idle slots upon starting a new task
+ bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
diff --git a/tools/server/README.md b/tools/server/README.md
index b30309bf3b0..56f21677c4e 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -167,7 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
-| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
+| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index b31981c5628..2ac3c420b33 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -878,13 +878,13 @@ struct server_context_impl {
metrics.init();
- if (params_base.clear_idle) {
+ if (params_base.cache_idle_slots) {
if (!params_base.kv_unified) {
- SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
- params_base.clear_idle = false;
+ SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
+ params_base.cache_idle_slots = false;
} else if (params_base.cache_ram_mib == 0) {
- SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
- params_base.clear_idle = false;
+ SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
+ params_base.cache_idle_slots = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
@@ -1757,7 +1757,7 @@ struct server_context_impl {
break; // drop the task
}
- if (params_base.clear_idle) {
+ if (params_base.cache_idle_slots) {
for (auto & s : slots) {
if (!s.is_processing()) {
slot_save_and_clear(s);
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index da93d50011e..f4b08b5dd0e 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -91,7 +91,7 @@ def test_clear_and_restore():
def test_disabled_with_flag():
global server
- server.no_clear_idle = True
+ server.no_cache_idle_slots = True
server.start()
log = LogReader(server.log_path)
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index 5ddac5be496..ddbb76c9adb 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -103,7 +103,7 @@ class ServerProcess:
media_path: str | None = None
sleep_idle_seconds: int | None = None
cache_ram: int | None = None
- no_clear_idle: bool = False
+ no_cache_idle_slots: bool = False
log_path: str | None = None
webui_mcp_proxy: bool = False
@@ -242,8 +242,8 @@ def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.cache_ram is not None:
server_args.extend(["--cache-ram", self.cache_ram])
- if self.no_clear_idle:
- server_args.append("--no-clear-idle")
+ if self.no_cache_idle_slots:
+ server_args.append("--no-cache-idle-slots")
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")