diff --git a/app/L0/_all/mod/_core/agent_prompt/AGENTS.md b/app/L0/_all/mod/_core/agent_prompt/AGENTS.md index 6b7e3a7d..8ddbe855 100644 --- a/app/L0/_all/mod/_core/agent_prompt/AGENTS.md +++ b/app/L0/_all/mod/_core/agent_prompt/AGENTS.md @@ -14,6 +14,7 @@ This module owns: - `prompt-items.js`: shared prompt-budget ratio parsing, keyed prompt-item normalization and merge helpers, long-message trimming placeholders, and the `space.chat` prompt-item access installer used by onscreen and admin chat stores - `prompt-runtime.js`: shared `AgentPromptInstance` lifecycle, prompt-input cloning, prompt-history rebuild fallback, and the stable `createAgentPromptInstance(...)` / `hasPreparedPromptInput(...)` helpers used by agent surfaces +- `trim-quantum.js`: shared cut-position quantization helper for long-message trimming so the byte position of every trimmed cut snaps to a stable grid across turns ## Local Contracts @@ -24,6 +25,7 @@ Current shared runtime contract: - normalized prompt items should cache `valueTokenCount` alongside the normalized string value so repeated prompt builds can reuse tokenizer results for the same item body - prompt-budget ratios are stored as percentages of the configured model `maxTokens`, with `system`, `history`, and `transient` required to total 100 while `singleMessage` is a separate percentage of the history budget - long prompt contributors may be trimmed through the shared middle-replacement placeholder emitted by `trimPromptLongMessage(...)`; the placeholder must keep a stable `space.chat.readLongMessage({ id, from, to })` instruction so the active chat runtime can expose the removed text on demand during that turn +- callers that plan a `removeChars` value for `trimPromptLongMessage(...)` from a live overflow budget should route the planned value through `quantizeRemovedChars(...)` first, so the byte position of the cut snaps to a stable grid across turns instead of drifting by tens of characters as the history grows; this keeps 
prefix prompt-cache backends (llama.cpp `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) warm across consecutive turns of long-context conversations - part-level prompt-budget trimming should build a one-shot thresholded multi-contributor plan that trims only contributors whose planned cut is at least `250` tokens; system and transient consumers may then fall back to one combined section-body trim when contributor-level trims would all be smaller than that threshold - `installPromptItemAccess(...)` must keep full prompt-item text in runtime-only memory while publishing only redacted `space.chat.promptItems` metadata plus `readLongMessage(...)` on the live chat namespace - this module is prompt-builder-agnostic; callers must provide `buildPromptInput(context)` and may optionally provide `updatePromptHistory({ context, historyMessages, options, prompt, promptInput })` diff --git a/app/L0/_all/mod/_core/agent_prompt/trim-quantum.js b/app/L0/_all/mod/_core/agent_prompt/trim-quantum.js new file mode 100644 index 00000000..3602a3c3 --- /dev/null +++ b/app/L0/_all/mod/_core/agent_prompt/trim-quantum.js @@ -0,0 +1,38 @@ +// Quantization grid for the long-message-trim cut position. +// +// The onscreen-agent trimmer plans `removeChars` from the live overflow +// budget on every turn. Because the budget drifts as the history grows, the +// planned cut position shifts by a small number of characters from one turn +// to the next, even when the underlying message content is unchanged. For +// inference backends that reuse a prefix prompt cache (llama.cpp +// `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) every byte after +// that shifting cut becomes a cache miss, which turns a 10-second warm +// reply into a multi-minute full prompt re-prefill on long-context Talk +// sessions. +// +// Snapping the planned cut to a fixed grid keeps the byte position of the +// cut stable across the vast majority of turns. 
The cut only moves when +// the planned value crosses a grid boundary, which empirically +// happens far less often than once per realistic session. +// +// The grid width is chosen large enough to absorb the per-turn drift +// observed in long Talk sessions (tens of characters per turn) by orders +// of magnitude, while staying small enough that the first trim of a +// message does not waste a meaningful share of the budget. +export const LONG_MESSAGE_CUT_QUANTUM_CHARS = 4096; + +// Round a planned `removeChars` value up to the next multiple of `quantum`. +// Returns 0 for zero, negative, or non-finite input so the "no trim yet" +// state stays representable; an unusable `quantum` falls back to the default. +export function quantizeRemovedChars(value, quantum = LONG_MESSAGE_CUT_QUANTUM_CHARS) { + const normalizedValue = Number.isFinite(Number(value)) ? Math.max(0, Math.ceil(Number(value))) : 0; + const normalizedQuantum = Number.isFinite(Number(quantum)) && Number(quantum) > 0 + ? Math.max(1, Math.floor(Number(quantum))) + : LONG_MESSAGE_CUT_QUANTUM_CHARS; + + if (!normalizedValue) { + return 0; + } + + return Math.ceil(normalizedValue / normalizedQuantum) * normalizedQuantum; +} diff --git a/app/L0/_all/mod/_core/onscreen_agent/llm.js b/app/L0/_all/mod/_core/onscreen_agent/llm.js index c761436e..3b10424e 100644 --- a/app/L0/_all/mod/_core/onscreen_agent/llm.js +++ b/app/L0/_all/mod/_core/onscreen_agent/llm.js @@ -12,6 +12,7 @@ import { trimPromptLongMessage } from "/mod/_core/agent_prompt/prompt-items.js"; import { createAgentPromptInstance } from "/mod/_core/agent_prompt/prompt-runtime.js"; +import { quantizeRemovedChars } from "/mod/_core/agent_prompt/trim-quantum.js"; import { buildMessagePromptParts, MESSAGE_PROMPT_PART_BLOCK } from "/mod/_core/onscreen_agent/attachments.js"; import * as llmParams from "/mod/_core/onscreen_agent/llm-params.js"; import * as skills from "/mod/_core/onscreen_agent/skills.js"; @@ -729,12 +730,14 @@ function 
trimPromptContributorByOverflow(contributor, overflowTokens) { removedChars: estimatedRemovedChars }) ); - const nextRemovedChars = contributor.removedChars + estimatePromptCharsForTokenRemoval( - contributor.originalValueText, - normalizedOverflowTokens + placeholderTokenCount, - { - tokenCount: originalValueTokenCount - } + const nextRemovedChars = quantizeRemovedChars( + contributor.removedChars + estimatePromptCharsForTokenRemoval( + contributor.originalValueText, + normalizedOverflowTokens + placeholderTokenCount, + { + tokenCount: originalValueTokenCount + } + ) ); let trimmedValue = trimPromptLongMessage(contributor.originalValueText, { id: contributor.id, @@ -745,12 +748,14 @@ function trimPromptContributorByOverflow(contributor, overflowTokens) { if (trimmedValueTokenCount > targetTokenCount) { const additionalOverflowTokens = trimmedValueTokenCount - targetTokenCount; - const recalibratedRemovedChars = trimmedValue.removedChars + estimatePromptCharsForTokenRemoval( - contributor.originalValueText, - additionalOverflowTokens, - { - tokenCount: originalValueTokenCount - } + const recalibratedRemovedChars = quantizeRemovedChars( + trimmedValue.removedChars + estimatePromptCharsForTokenRemoval( + contributor.originalValueText, + additionalOverflowTokens, + { + tokenCount: originalValueTokenCount + } + ) ); trimmedValue = trimPromptLongMessage(contributor.originalValueText, { diff --git a/tests/trim_quantum_test.mjs b/tests/trim_quantum_test.mjs new file mode 100644 index 00000000..2a318732 --- /dev/null +++ b/tests/trim_quantum_test.mjs @@ -0,0 +1,59 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import { + LONG_MESSAGE_CUT_QUANTUM_CHARS, + quantizeRemovedChars +} from "../app/L0/_all/mod/_core/agent_prompt/trim-quantum.js"; + +test("quantizeRemovedChars returns 0 unchanged so the no-trim state stays representable", () => { + assert.equal(quantizeRemovedChars(0), 0); + assert.equal(quantizeRemovedChars(-5), 0); + 
assert.equal(quantizeRemovedChars(NaN), 0); + assert.equal(quantizeRemovedChars(undefined), 0); +}); + +test("quantizeRemovedChars rounds any positive value up to the next quantum step", () => { + assert.equal(quantizeRemovedChars(1), LONG_MESSAGE_CUT_QUANTUM_CHARS); + assert.equal(quantizeRemovedChars(LONG_MESSAGE_CUT_QUANTUM_CHARS - 1), LONG_MESSAGE_CUT_QUANTUM_CHARS); + assert.equal(quantizeRemovedChars(LONG_MESSAGE_CUT_QUANTUM_CHARS), LONG_MESSAGE_CUT_QUANTUM_CHARS); + assert.equal(quantizeRemovedChars(LONG_MESSAGE_CUT_QUANTUM_CHARS + 1), 2 * LONG_MESSAGE_CUT_QUANTUM_CHARS); +}); + +test("quantizeRemovedChars holds the cut position stable across small per-turn drift", () => { + // The onscreen trimmer plans `removeChars` from the live overflow budget, + // which drifts by tens of characters as the conversation history grows. + // Quantization must absorb that drift: with the default 4096 grid every + // sample below sits in the cell (12_288, 16_384] and must round up to the + // same 16_384. This guards against a future refactor that accidentally + // narrows the quantum below realistic per-turn drift. 
+ const baseline = 12_649; // empirically observed cut from a real Talk session + const driftedSamples = [12_649, 12_661, 12_673, 12_685, 12_700]; + + const quantized = driftedSamples.map((value) => quantizeRemovedChars(value)); + const reference = quantizeRemovedChars(baseline); + + for (const value of quantized) { + assert.equal(value, reference); + } +}); + +test("quantizeRemovedChars steps to the next quantum only when drift exceeds a full step", () => { + const baseline = LONG_MESSAGE_CUT_QUANTUM_CHARS + 1; + const oneStepLater = baseline + LONG_MESSAGE_CUT_QUANTUM_CHARS; + + assert.equal(quantizeRemovedChars(baseline), 2 * LONG_MESSAGE_CUT_QUANTUM_CHARS); + assert.equal(quantizeRemovedChars(oneStepLater), 3 * LONG_MESSAGE_CUT_QUANTUM_CHARS); +}); + +test("quantizeRemovedChars accepts a custom quantum so future tuning can compare grid sizes", () => { + assert.equal(quantizeRemovedChars(1, 1024), 1024); + assert.equal(quantizeRemovedChars(1025, 1024), 2048); + assert.equal(quantizeRemovedChars(1, 16_384), 16_384); +}); + +test("quantizeRemovedChars falls back to the default quantum when given a non-positive override", () => { + assert.equal(quantizeRemovedChars(1, 0), LONG_MESSAGE_CUT_QUANTUM_CHARS); + assert.equal(quantizeRemovedChars(1, -10), LONG_MESSAGE_CUT_QUANTUM_CHARS); + assert.equal(quantizeRemovedChars(1, NaN), LONG_MESSAGE_CUT_QUANTUM_CHARS); +});