Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1312,13 +1312,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
{"--clear-idle"},
{"--no-clear-idle"},
{"--cache-idle-slots"},
{"--no-cache-idle-slots"},
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
[](common_params & params, bool value) {
params.clear_idle = value;
params.cache_idle_slots = value;
}
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
Expand Down
2 changes: 1 addition & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
bool cache_prompt = true; // whether to enable prompt caching
bool clear_idle = true; // save and clear idle slots upon starting a new task
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
Expand Down
2 changes: 1 addition & 1 deletion tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
Expand Down
12 changes: 6 additions & 6 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -878,13 +878,13 @@ struct server_context_impl {

metrics.init();

if (params_base.clear_idle) {
if (params_base.cache_idle_slots) {
if (!params_base.kv_unified) {
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
params_base.clear_idle = false;
SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
params_base.cache_idle_slots = false;
} else if (params_base.cache_ram_mib == 0) {
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
params_base.clear_idle = false;
SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
params_base.cache_idle_slots = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
Expand Down Expand Up @@ -1757,7 +1757,7 @@ struct server_context_impl {
break; // drop the task
}

if (params_base.clear_idle) {
if (params_base.cache_idle_slots) {
for (auto & s : slots) {
if (!s.is_processing()) {
slot_save_and_clear(s);
Expand Down
2 changes: 1 addition & 1 deletion tools/server/tests/unit/test_kv_keep_only_active.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_clear_and_restore():

def test_disabled_with_flag():
global server
server.no_clear_idle = True
server.no_cache_idle_slots = True
server.start()
log = LogReader(server.log_path)

Expand Down
6 changes: 3 additions & 3 deletions tools/server/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class ServerProcess:
media_path: str | None = None
sleep_idle_seconds: int | None = None
cache_ram: int | None = None
no_clear_idle: bool = False
no_cache_idle_slots: bool = False
log_path: str | None = None
webui_mcp_proxy: bool = False

Expand Down Expand Up @@ -242,8 +242,8 @@ def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.cache_ram is not None:
server_args.extend(["--cache-ram", self.cache_ram])
if self.no_clear_idle:
server_args.append("--no-clear-idle")
if self.no_cache_idle_slots:
server_args.append("--no-cache-idle-slots")
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")

Expand Down