From bdc786761ee2f12720b10279c5e862bbb855aec5 Mon Sep 17 00:00:00 2001
From: kvc0 <3454741+kvc0@users.noreply.github.com>
Date: Sat, 11 Apr 2026 22:25:40 -0700
Subject: [PATCH 1/5] anthropic: fix prefix caching

When testing claude code against llama.cpp, I noticed that only
n_past 18577 was used even when context was 60k or more. The log
in llama-server says:
```
slot update_slots: id 3 | task 10342 | old: ... ; cch= | defa0;You are
slot update_slots: id 3 | task 10342 | new: ... ; cch= | 1c8b4;
```

I observed that the cch value changed every time.

Reading about that, the x-anthropic-billing-header system message seems
to be specially handled inside of the anthropic api. I could remove it,
but there is a meaningful string sometimes included at the end. So
instead, I just replace the changing cch checksum with fffff.

It's always 5 hexadecimal characters, but I've written the replacement
defensively in case they change the protocol.
---
 tools/server/server-common.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index ed5e306fc5b..bffcfc280d2 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1447,11 +1447,28 @@ json convert_anthropic_to_oai(const json & body) {
         } else if (system_param.is_array()) {
             for (const auto & block : system_param) {
                 if (json_value(block, "type", std::string()) == "text") {
-                    system_content += json_value(block, "text", std::string());
+                    auto system_text = json_value(block, "text", std::string());
+                    if (system_text.rfind("x-anthropic-billing-header:", 0) == 0 ) {
+                        // This is a claude message with a "cch=ef01a" attribute that breaks prefix caching.
+                        // The cch stamp is a whitebox end-to-end integrity hint. It's not meaningful as a
+                        // system prompt data, particularly to llama.cpp, but its presence means the prefix
+                        // cache will not get past it.
+                        size_t index_cch = system_text.rfind("cch=");
+                        size_t index_replace = index_cch + 4;
+                        size_t cch_length = 5;
+                        if (index_cch != std::string::npos && index_replace + cch_length < system_text.length() && system_text[index_replace + cch_length] == ';') {
+                            // only change this value if the cch still looks right.
+                            for (size_t i = 0; i < cch_length; ++i) {
+                                system_text[index_replace + i] = 'f';
+                            }
+                        } else {
+                            LOG_ERR("anthropic string not as expected: %s", system_text.c_str());
+                        }
+                    }
+                    system_content += system_text;
                 }
             }
         }
-
         oai_messages.push_back({
             {"role", "system"},
             {"content", system_content}

From a1dea2bfc10633c35f993a14c0541ee798678052 Mon Sep 17 00:00:00 2001
From: kvc0 <3454741+kvc0@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:15:15 -0700
Subject: [PATCH 2/5] review feedback

---
 tools/server/server-common.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index bffcfc280d2..c1324431968 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1453,9 +1453,10 @@ json convert_anthropic_to_oai(const json & body) {
                         // The cch stamp is a whitebox end-to-end integrity hint. It's not meaningful as a
                         // system prompt data, particularly to llama.cpp, but its presence means the prefix
                         // cache will not get past it.
-                        size_t index_cch = system_text.rfind("cch=");
+                        const size_t header_prefix_length = strlen("x-anthropic-billing-header:");
+                        const size_t cch_length = 5;
+                        size_t index_cch = system_text.find("cch=", header_prefix_length);
                         size_t index_replace = index_cch + 4;
-                        size_t cch_length = 5;
                         if (index_cch != std::string::npos && index_replace + cch_length < system_text.length() && system_text[index_replace + cch_length] == ';') {
                             // only change this value if the cch still looks right.
                             for (size_t i = 0; i < cch_length; ++i) {

From ecaebff8f5ad398726686a1b8a9992c3569a3fc5 Mon Sep 17 00:00:00 2001
From: kvc0 <3454741+kvc0@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:20:25 -0700
Subject: [PATCH 3/5] more comment clarity

---
 tools/server/server-common.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index c1324431968..3aeeb1a3bb0 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1452,7 +1452,11 @@ json convert_anthropic_to_oai(const json & body) {
                         // This is a claude message with a "cch=ef01a" attribute that breaks prefix caching.
                         // The cch stamp is a whitebox end-to-end integrity hint. It's not meaningful as a
                         // system prompt data, particularly to llama.cpp, but its presence means the prefix
-                        // cache will not get past it.
+                        // cache will not get past it: It changes on each request.
+                        //
+                        // Reference: https://github.com/ggml-org/llama.cpp/pull/21793
+                        // Example header: x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=a5145;You are Claude Code, Anthropic's official CLI for Claude.
+                        //                                                                                            ^^^^^
                         const size_t header_prefix_length = strlen("x-anthropic-billing-header:");
                         const size_t cch_length = 5;
                         size_t index_cch = system_text.find("cch=", header_prefix_length);

From 3a993a7c5a988bed8fd1b678a30a1e20f636ca58 Mon Sep 17 00:00:00 2001
From: kvc0 <3454741+kvc0@users.noreply.github.com>
Date: Sun, 12 Apr 2026 15:06:29 -0700
Subject: [PATCH 4/5] another autofeedback

---
 tools/server/server-common.cpp | 60 +++++++++++++++++++++-------------
 tools/server/server-common.h   |  3 ++
 2 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 3aeeb1a3bb0..93c583d6298 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1433,6 +1433,42 @@ json convert_responses_to_chatcmpl(const json & response_body) {
     return chatcmpl_body;
 }
 
+// Edits the cch section of an "x-anthropic-billing-header" system prompt.
+// Does nothing to any other prompt.
+//
+// This is a claude message with a "cch=ef01a" attribute that breaks prefix caching.
+// The cch stamp is a whitebox end-to-end integrity hint. It's not meaningful as a
+// system prompt data, particularly to llama.cpp, but its presence means the prefix
+// cache will not get past it: It changes on each request.
+//
+// Reference: https://github.com/ggml-org/llama.cpp/pull/21793
+// Example header:
+// ```
+// x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=a5145;You are Claude Code, Anthropic's official CLI for Claude.
+//                                                                            ^^^^^
+// ```
+void normalize_anthropic_billing_header(std::string & system_text) {
+    if (system_text.rfind("x-anthropic-billing-header:", 0) != 0) {
+        return;
+    }
+
+    const size_t header_prefix_length = strlen("x-anthropic-billing-header:");
+    const size_t cch_length = 5;
+    const size_t index_cch = system_text.find("cch=", header_prefix_length);
+    if (index_cch == std::string::npos) {
+        return;
+    }
+
+    const size_t index_replace = index_cch + 4;
+    if (index_replace + cch_length < system_text.length() && system_text[index_replace + cch_length] == ';') {
+        for (size_t i = 0; i < cch_length; ++i) {
+            system_text[index_replace + i] = 'f';
+        }
+    } else {
+        LOG_ERR("anthropic string not as expected: %s", system_text.c_str());
+    }
+}
+
 json convert_anthropic_to_oai(const json & body) {
     json oai_body;
 
@@ -1444,32 +1480,12 @@ json convert_anthropic_to_oai(const json & body) {
 
         if (system_param.is_string()) {
             system_content = system_param.get<std::string>();
+            normalize_anthropic_billing_header(system_content);
         } else if (system_param.is_array()) {
             for (const auto & block : system_param) {
                 if (json_value(block, "type", std::string()) == "text") {
                     auto system_text = json_value(block, "text", std::string());
-                    if (system_text.rfind("x-anthropic-billing-header:", 0) == 0 ) {
-                        // This is a claude message with a "cch=ef01a" attribute that breaks prefix caching.
-                        // The cch stamp is a whitebox end-to-end integrity hint. It's not meaningful as a
-                        // system prompt data, particularly to llama.cpp, but its presence means the prefix
-                        // cache will not get past it: It changes on each request.
-                        //
-                        // Reference: https://github.com/ggml-org/llama.cpp/pull/21793
-                        // Example header: x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=a5145;You are Claude Code, Anthropic's official CLI for Claude.
-                        //                                                                                            ^^^^^
-                        const size_t header_prefix_length = strlen("x-anthropic-billing-header:");
-                        const size_t cch_length = 5;
-                        size_t index_cch = system_text.find("cch=", header_prefix_length);
-                        size_t index_replace = index_cch + 4;
-                        if (index_cch != std::string::npos && index_replace + cch_length < system_text.length() && system_text[index_replace + cch_length] == ';') {
-                            // only change this value if the cch still looks right.
-                            for (size_t i = 0; i < cch_length; ++i) {
-                                system_text[index_replace + i] = 'f';
-                            }
-                        } else {
-                            LOG_ERR("anthropic string not as expected: %s", system_text.c_str());
-                        }
-                    }
+                    normalize_anthropic_billing_header(system_text);
                     system_content += system_text;
                 }
             }
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 213ae52bb09..35acf6a43d4 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -347,6 +347,9 @@ std::string format_oai_resp_sse(const json & data);
 // format Anthropic-style SSE with event types
 std::string format_anthropic_sse(const json & data);
 
+// re-format an x-anthropic-billing-header system prompt's cch section for prompt caching friendliness
+void normalize_anthropic_billing_header(std::string & system_text);
+
 bool is_valid_utf8(const std::string & str);
 
 //

From f3ec37c2857f2f00e0feea2ca8be23410e2726aa Mon Sep 17 00:00:00 2001
From: kvc0 <3454741+kvc0@users.noreply.github.com>
Date: Sun, 12 Apr 2026 15:15:06 -0700
Subject: [PATCH 5/5] just declare the helper locally

---
 tools/server/server-common.cpp | 2 ++
 tools/server/server-common.h   | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 93c583d6298..feea541c374 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1433,6 +1433,8 @@ json convert_responses_to_chatcmpl(const json & response_body) {
     return chatcmpl_body;
 }
 
+// re-format an x-anthropic-billing-header system prompt's cch section for prompt caching friendliness
+void normalize_anthropic_billing_header(std::string & system_text);
 // Edits the cch section of an "x-anthropic-billing-header" system prompt.
 // Does nothing to any other prompt.
 //
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 35acf6a43d4..213ae52bb09 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -347,9 +347,6 @@ std::string format_oai_resp_sse(const json & data);
 // format Anthropic-style SSE with event types
 std::string format_anthropic_sse(const json & data);
 
-// re-format an x-anthropic-billing-header system prompt's cch section for prompt caching friendliness
-void normalize_anthropic_billing_header(std::string & system_text);
-
 bool is_valid_utf8(const std::string & str);
 
 //