From d1aca48b6496122c72c9a47139afeda63615c030 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com>
Date: Fri, 10 Apr 2026 22:36:19 +0300
Subject: [PATCH] server: rename --clear-idle to --cache-idle-slots

---
 common/arg.cpp                                      |  8 ++++----
 common/common.h                                     |  2 +-
 tools/server/README.md                              |  2 +-
 tools/server/server-context.cpp                     | 12 ++++++------
 tools/server/tests/unit/test_kv_keep_only_active.py |  2 +-
 tools/server/tests/utils.py                         |  6 +++---
 6 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index c0cc576f29c..f5ee910a537 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1312,13 +1312,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
-        {"--clear-idle"},
-        {"--no-clear-idle"},
+        {"--cache-idle-slots"},
+        {"--no-cache-idle-slots"},
         "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
         [](common_params & params, bool value) {
-            params.clear_idle = value;
+            params.cache_idle_slots = value;
         }
-    ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
diff --git a/common/common.h b/common/common.h
index 020b6a721ff..026ff9dcf9d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -579,7 +579,7 @@ struct common_params {
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
     bool cache_prompt = true; // whether to enable prompt caching
-    bool clear_idle = true; // save and clear idle slots upon starting a new task
+    bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
     int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
     int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
     int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
diff --git a/tools/server/README.md b/tools/server/README.md
index b30309bf3b0..56f21677c4e 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -167,7 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
 | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
-| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
+| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
 | `-sp, --special` | special tokens output enabled (default: false) |
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index b31981c5628..2ac3c420b33 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -878,13 +878,13 @@ struct server_context_impl {
 
     metrics.init();
 
-    if (params_base.clear_idle) {
+    if (params_base.cache_idle_slots) {
         if (!params_base.kv_unified) {
-            SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
-            params_base.clear_idle = false;
+            SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
+            params_base.cache_idle_slots = false;
         } else if (params_base.cache_ram_mib == 0) {
-            SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
-            params_base.clear_idle = false;
+            SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
+            params_base.cache_idle_slots = false;
         } else {
             SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
             SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
@@ -1757,7 +1757,7 @@ struct server_context_impl {
                 break; // drop the task
             }
 
-            if (params_base.clear_idle) {
+            if (params_base.cache_idle_slots) {
                 for (auto & s : slots) {
                     if (!s.is_processing()) {
                         slot_save_and_clear(s);
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index da93d50011e..f4b08b5dd0e 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -91,7 +91,7 @@ def test_clear_and_restore():
 
 def test_disabled_with_flag():
     global server
-    server.no_clear_idle = True
+    server.no_cache_idle_slots = True
     server.start()
 
     log = LogReader(server.log_path)
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index 5ddac5be496..ddbb76c9adb 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -103,7 +103,7 @@ class ServerProcess:
     media_path: str | None = None
     sleep_idle_seconds: int | None = None
     cache_ram: int | None = None
-    no_clear_idle: bool = False
+    no_cache_idle_slots: bool = False
    log_path: str | None = None
     webui_mcp_proxy: bool = False
 
@@ -242,8 +242,8 @@ def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
         if self.cache_ram is not None:
             server_args.extend(["--cache-ram", self.cache_ram])
-        if self.no_clear_idle:
-            server_args.append("--no-clear-idle")
+        if self.no_cache_idle_slots:
+            server_args.append("--no-cache-idle-slots")
         if self.webui_mcp_proxy:
             server_args.append("--webui-mcp-proxy")