agent0ai · nsyring · May 13, 2026
diff --git a/app/L0/_all/mod/_core/agent_prompt/AGENTS.md b/app/L0/_all/mod/_core/agent_prompt/AGENTS.md
@@ -14,6 +14,7 @@ This module owns:
 
 - `prompt-items.js`: shared prompt-budget ratio parsing, keyed prompt-item normalization and merge helpers, long-message trimming placeholders, and the `space.chat` prompt-item access installer used by onscreen and admin chat stores
 - `prompt-runtime.js`: shared `AgentPromptInstance` lifecycle, prompt-input cloning, prompt-history rebuild fallback, and the stable `createAgentPromptInstance(...)` / `hasPreparedPromptInput(...)` helpers used by agent surfaces
+- `trim-quantum.js`: shared cut-position quantization helper for long-message trimming so the byte position of every trimmed cut snaps to a stable grid across turns
 
 ## Local Contracts
 
@@ -24,6 +25,7 @@ Current shared runtime contract:
 - normalized prompt items should cache `valueTokenCount` alongside the normalized string value so repeated prompt builds can reuse tokenizer results for the same item body
 - prompt-budget ratios are stored as percentages of the configured model `maxTokens`, with `system`, `history`, and `transient` required to total 100 while `singleMessage` is a separate percentage of the history budget
 - long prompt contributors may be trimmed through the shared middle-replacement placeholder emitted by `trimPromptLongMessage(...)`; the placeholder must keep a stable `space.chat.readLongMessage({ id, from, to })` instruction so the active chat runtime can expose the removed text on demand during that turn
+- callers that plan a `removeChars` value for `trimPromptLongMessage(...)` from a live overflow budget should route the planned value through `quantizeRemovedChars(...)` first, so the byte position of the cut snaps to a stable grid across turns instead of drifting by tens of characters as the history grows; this keeps prefix prompt-cache backends (llama.cpp `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) warm across consecutive turns of long-context conversations
 - part-level prompt-budget trimming should build a one-shot thresholded multi-contributor plan that trims only contributors whose planned cut is at least `250` tokens; system and transient consumers may then fall back to one combined section-body trim when contributor-level trims would all be smaller than that threshold
 - `installPromptItemAccess(...)` must keep full prompt-item text in runtime-only memory while publishing only redacted `space.chat.promptItems` metadata plus `readLongMessage(...)` on the live chat namespace
 - this module is prompt-builder-agnostic; callers must provide `buildPromptInput(context)` and may optionally provide `updatePromptHistory({ context, historyMessages, options, prompt, promptInput })`

diff --git a/app/L0/_all/mod/_core/agent_prompt/trim-quantum.js b/app/L0/_all/mod/_core/agent_prompt/trim-quantum.js
@@ -0,0 +1,38 @@
+// Quantization grid for the long-message-trim cut position.
+//
+// The onscreen-agent trimmer plans `removeChars` from the live overflow
+// budget on every turn. Because the budget drifts as the history grows, the
+// planned cut position shifts by a small number of characters from one turn
+// to the next, even when the underlying message content is unchanged. For
+// inference backends that reuse a prefix prompt cache (llama.cpp
+// `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) every byte after
+// that shifting cut becomes a cache miss, which turns a 10-second warm
+// reply into a multi-minute full prompt re-prefill on long-context Talk
+// sessions.
+//
+// Snapping the planned cut to a fixed grid keeps the byte position of the
+// cut stable across the vast majority of turns. The cut only moves when
+// the planned value crosses a grid boundary, which empirically happens
+// far less often than once per realistic session.
+//
+// Grid width in characters (the unit of `removeChars`). Large enough to
+// absorb the per-turn drift observed in long Talk sessions (tens of
+// characters per turn) by orders of magnitude, yet small enough that the
+// first trim of a message does not waste a meaningful share of the budget.
+export const LONG_MESSAGE_CUT_QUANTUM_CHARS = 4096;
+
+// Round a planned `removeChars` value up to the next multiple of the cut
+// quantum. Zero, negative, and non-finite values return 0 ("no trim yet");
+// a non-finite or non-positive `quantum` falls back to the default grid.
+export function quantizeRemovedChars(value, quantum = LONG_MESSAGE_CUT_QUANTUM_CHARS) {
+  const normalizedValue = Number.isFinite(Number(value)) ? Math.max(0, Math.ceil(Number(value))) : 0;
+  const normalizedQuantum = Number.isFinite(Number(quantum)) && Number(quantum) > 0
+    ? Math.max(1, Math.floor(Number(quantum))) // fractional grids below 1 become 1
+    : LONG_MESSAGE_CUT_QUANTUM_CHARS;
+
+  if (!normalizedValue) {
+    return 0; // keep the "no trim yet" state off the grid
+  }
+
+  return Math.ceil(normalizedValue / normalizedQuantum) * normalizedQuantum; // exact multiples are kept
+}
diff --git a/app/L0/_all/mod/_core/onscreen_agent/llm.js b/app/L0/_all/mod/_core/onscreen_agent/llm.js
@@ -12,6 +12,7 @@ import {
   trimPromptLongMessage
 } from "/mod/_core/agent_prompt/prompt-items.js";
 import { createAgentPromptInstance } from "/mod/_core/agent_prompt/prompt-runtime.js";
+import { quantizeRemovedChars } from "/mod/_core/agent_prompt/trim-quantum.js";
 import { buildMessagePromptParts, MESSAGE_PROMPT_PART_BLOCK } from "/mod/_core/onscreen_agent/attachments.js";
 import * as llmParams from "/mod/_core/onscreen_agent/llm-params.js";
 import * as skills from "/mod/_core/onscreen_agent/skills.js";
@@ -729,12 +730,14 @@ function trimPromptContributorByOverflow(contributor, overflowTokens) {
       removedChars: estimatedRemovedChars
     })
   );
-  const nextRemovedChars = contributor.removedChars + estimatePromptCharsForTokenRemoval(
-    contributor.originalValueText,
-    normalizedOverflowTokens + placeholderTokenCount,
-    {
-      tokenCount: originalValueTokenCount
-    }
+  const nextRemovedChars = quantizeRemovedChars(
+    contributor.removedChars + estimatePromptCharsForTokenRemoval(
+      contributor.originalValueText,
+      normalizedOverflowTokens + placeholderTokenCount,
+      {
+        tokenCount: originalValueTokenCount
+      }
+    )
   );
   let trimmedValue = trimPromptLongMessage(contributor.originalValueText, {
     id: contributor.id,
@@ -745,12 +748,14 @@ function trimPromptContributorByOverflow(contributor, overflowTokens) {
 
   if (trimmedValueTokenCount > targetTokenCount) {
     const additionalOverflowTokens = trimmedValueTokenCount - targetTokenCount;
-    const recalibratedRemovedChars = trimmedValue.removedChars + estimatePromptCharsForTokenRemoval(
-      contributor.originalValueText,
-      additionalOverflowTokens,
-      {
-        tokenCount: originalValueTokenCount
-      }
+    const recalibratedRemovedChars = quantizeRemovedChars(
+      trimmedValue.removedChars + estimatePromptCharsForTokenRemoval(
+        contributor.originalValueText,
+        additionalOverflowTokens,
+        {
+          tokenCount: originalValueTokenCount
+        }
+      )
     );
 
     trimmedValue = trimPromptLongMessage(contributor.originalValueText, {

diff --git a/tests/trim_quantum_test.mjs b/tests/trim_quantum_test.mjs
@@ -0,0 +1,59 @@
+import assert from "node:assert/strict";
+import test from "node:test";
+
+import {
+  LONG_MESSAGE_CUT_QUANTUM_CHARS,
+  quantizeRemovedChars
+} from "../app/L0/_all/mod/_core/agent_prompt/trim-quantum.js";
+
+test("quantizeRemovedChars returns 0 unchanged so the no-trim state stays representable", () => {
+  assert.equal(quantizeRemovedChars(0), 0);
+  assert.equal(quantizeRemovedChars(-5), 0); // negative inputs clamp to 0
+  assert.equal(quantizeRemovedChars(NaN), 0); // non-finite inputs are treated as 0
+  assert.equal(quantizeRemovedChars(undefined), 0); // Number(undefined) is NaN
+});
+
+test("quantizeRemovedChars rounds any positive value up to the next quantum step", () => {
+  assert.equal(quantizeRemovedChars(1), LONG_MESSAGE_CUT_QUANTUM_CHARS);
+  assert.equal(quantizeRemovedChars(LONG_MESSAGE_CUT_QUANTUM_CHARS - 1), LONG_MESSAGE_CUT_QUANTUM_CHARS);
+  assert.equal(quantizeRemovedChars(LONG_MESSAGE_CUT_QUANTUM_CHARS), LONG_MESSAGE_CUT_QUANTUM_CHARS); // exact multiples are kept
+  assert.equal(quantizeRemovedChars(LONG_MESSAGE_CUT_QUANTUM_CHARS + 1), 2 * LONG_MESSAGE_CUT_QUANTUM_CHARS);
+});
+
+test("quantizeRemovedChars holds the cut position stable across small per-turn drift", () => {
+  // The onscreen trimmer plans `removeChars` from the live overflow budget,
+  // which drifts by tens of characters as the conversation history grows.
+  // Quantization should absorb those small drifts so every sample below
+  // maps to the same grid value as the baseline. This guards against a
+  // future refactor that accidentally narrows the quantum below realistic
+  // per-turn drift.
+  const baseline = 12_649; // empirically observed cut from a real Talk session
+  const driftedSamples = [12_649, 12_661, 12_673, 12_685, 12_700]; // all within (12_288, 16_384] for the 4096 grid
+
+  const quantized = driftedSamples.map((value) => quantizeRemovedChars(value));
+  const reference = quantizeRemovedChars(baseline);
+
+  for (const value of quantized) {
+    assert.equal(value, reference);
+  }
+});
+
+test("quantizeRemovedChars steps to the next quantum only when drift exceeds a full step", () => {
+  const baseline = LONG_MESSAGE_CUT_QUANTUM_CHARS + 1; // just past the first grid boundary
+  const oneStepLater = baseline + LONG_MESSAGE_CUT_QUANTUM_CHARS; // exactly one grid step further
+
+  assert.equal(quantizeRemovedChars(baseline), 2 * LONG_MESSAGE_CUT_QUANTUM_CHARS);
+  assert.equal(quantizeRemovedChars(oneStepLater), 3 * LONG_MESSAGE_CUT_QUANTUM_CHARS);
+});
+
+test("quantizeRemovedChars accepts a custom quantum so future tuning can compare grid sizes", () => {
+  assert.equal(quantizeRemovedChars(1, 1024), 1024);
+  assert.equal(quantizeRemovedChars(1025, 1024), 2048); // one past a boundary rounds up a full step
+  assert.equal(quantizeRemovedChars(1, 16_384), 16_384);
+});
+
+test("quantizeRemovedChars falls back to the default quantum when given a non-positive override", () => {
+  assert.equal(quantizeRemovedChars(1, 0), LONG_MESSAGE_CUT_QUANTUM_CHARS);
+  assert.equal(quantizeRemovedChars(1, -10), LONG_MESSAGE_CUT_QUANTUM_CHARS);
+  assert.equal(quantizeRemovedChars(1, NaN), LONG_MESSAGE_CUT_QUANTUM_CHARS); // non-finite override also falls back
+});