From 458c551cbdf0a42fbb97e14ff2c674a6936100c8 Mon Sep 17 00:00:00 2001 From: hexin Date: Mon, 25 May 2026 11:53:47 +0800 Subject: [PATCH] fix(engine): keep auto-compaction working on sub-500K self-hosted windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `context_input_budget` reserved the full TURN_MAX_OUTPUT_TOKENS (262K) for output regardless of window size. For a self-hosted model with a window below that reservation (e.g. a 256K vLLM Qwen deployment) the math was `256K - 262K - 1K`, which underflows `checked_sub` to `None`. A `None` budget silently disables every preflight check and emergency context recovery path, so the session never compacts and runs until the provider hard-rejects on context length. Two coupled fixes: 1. `context_window_for_model`: apply the explicit `_Nk` suffix hint for any vendor, not just DeepSeek. A self-hosted served-model-name like `qwen3-32b-256k` is the only window signal we have for non-DeepSeek/Claude models; without this its window resolves to `None`. Renamed the helper `deepseek_context_window_hint` -> `explicit_context_window_hint` since it is now vendor-agnostic. 2. `context_input_budget`: tier the reserved-output term by window — `>= 500K` keeps the full 262K headroom (preserves the V4 interleaved-thinking contract), `< 500K` falls back to `effective_max_output_tokens` (what the API actually caps output at), yielding a usable positive budget. Dropped the vestigial `requested_output_tokens` parameter (every caller passed the same constant). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tui/src/core/engine.rs | 17 +++------ crates/tui/src/core/engine/capacity_flow.rs | 2 +- crates/tui/src/core/engine/context.rs | 32 +++++++++++++++-- crates/tui/src/core/engine/tests.rs | 40 +++++++++------------ crates/tui/src/core/engine/turn_loop.rs | 16 ++------- crates/tui/src/models.rs | 22 ++++++++---- 6 files changed, 70 insertions(+), 59 deletions(-) diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 202cd1648..52bddbc32 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -1286,15 +1286,8 @@ impl Engine { removed } - async fn recover_context_overflow( - &mut self, - client: &DeepSeekClient, - reason: &str, - requested_output_tokens: u32, - ) -> bool { - let Some(target_budget) = - context_input_budget(&self.session.model, requested_output_tokens) - else { + async fn recover_context_overflow(&mut self, client: &DeepSeekClient, reason: &str) -> bool { + let Some(target_budget) = context_input_budget(&self.session.model) else { return false; }; @@ -1970,9 +1963,9 @@ mod handle; pub(crate) use context::compact_tool_result_for_context; use context::{ COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP, - TURN_MAX_OUTPUT_TOKENS, context_input_budget, effective_max_output_tokens, - estimate_input_tokens_conservative, extract_compaction_summary_prompt, - is_context_length_error_message, summarize_text, turn_response_headroom_tokens, + context_input_budget, effective_max_output_tokens, estimate_input_tokens_conservative, + extract_compaction_summary_prompt, is_context_length_error_message, summarize_text, + turn_response_headroom_tokens, }; mod dispatch; mod loop_guard; diff --git a/crates/tui/src/core/engine/capacity_flow.rs b/crates/tui/src/core/engine/capacity_flow.rs index cee5fb76e..fe3577622 100644 --- a/crates/tui/src/core/engine/capacity_flow.rs +++ b/crates/tui/src/core/engine/capacity_flow.rs @@ -435,7 
+435,7 @@ impl Engine { } if !refreshed { - let target_budget = context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS) + let target_budget = context_input_budget(&self.session.model) .unwrap_or(self.config.compaction.token_threshold.max(1)); if self.estimated_input_tokens() > target_budget { let trimmed = self.trim_oldest_messages_to_budget(target_budget); diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs index cb97e7744..6a28d6b45 100644 --- a/crates/tui/src/core/engine/context.rs +++ b/crates/tui/src/core/engine/context.rs @@ -354,9 +354,35 @@ pub(super) fn estimate_input_tokens_conservative( .saturating_add(framing_overhead) } -pub(super) fn context_input_budget(model: &str, requested_output_tokens: u32) -> Option { - let window = usize::try_from(context_window_for_model(model)?).ok()?; - let output = usize::try_from(requested_output_tokens).ok()?; +/// Context windows at or above this size reserve the full +/// [`TURN_MAX_OUTPUT_TOKENS`] (262K) when computing the internal input budget, +/// leaving room for V4-class interleaved thinking. Below it, the reservation +/// falls back to [`effective_max_output_tokens`] so a smaller self-hosted +/// window does not underflow to a negative budget. +const INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD: u32 = 500_000; + +/// Internal input-side token budget for a model: `window - reserved_output - +/// headroom`. Used by the preflight check, emergency recovery, and capacity +/// trimming to decide when to compact. +/// +/// The reserved-output term is window-dependent: +/// * `window >= 500K` (V4-class large-context) -> [`TURN_MAX_OUTPUT_TOKENS`] +/// (262K). Preserves the "leave room for interleaved thinking" contract. +/// * `window < 500K` (smaller / self-hosted, e.g. a 256K vLLM Qwen window) +/// -> [`effective_max_output_tokens`], i.e. what the API actually caps +/// output at. 
Reserving the full 262K here would compute +/// `256K - 262K - 1K`, which underflows `checked_sub` to `None` and +/// *silently disables every preflight and emergency recovery path* — the +/// session then runs until the provider hard-rejects on context length. +pub(super) fn context_input_budget(model: &str) -> Option { + let window_tokens = context_window_for_model(model)?; + let window = usize::try_from(window_tokens).ok()?; + let reserved_output = if window_tokens >= INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD { + TURN_MAX_OUTPUT_TOKENS + } else { + effective_max_output_tokens(model) + }; + let output = usize::try_from(reserved_output).ok()?; window .checked_sub(output) .and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS)) diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs index 851b09ea0..e68f5fb24 100644 --- a/crates/tui/src/core/engine/tests.rs +++ b/crates/tui/src/core/engine/tests.rs @@ -1,5 +1,6 @@ use super::*; +use super::context::TURN_MAX_OUTPUT_TOKENS; use crate::models::SystemBlock; use crate::test_support::lock_test_env; use crate::tools::spec::ToolCapability; @@ -916,7 +917,7 @@ fn detects_context_length_errors_from_provider_payloads() { fn context_budget_reserves_output_and_headroom() { // V4 has a 1M context window — the only family that comfortably hosts // a 256K output reservation without saturating the input budget to 0. 
- let budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS) + let budget = context_input_budget("deepseek-v4-pro") .expect("deepseek-v4-pro should have a known context window"); let v4_window: usize = 1_000_000; let expected = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize; @@ -943,31 +944,24 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() { } #[test] -fn internal_context_budget_unaffected_by_api_request_cap() { - // The internal context budget (used for compaction/preflight/recovery) - // must still use the full TURN_MAX_OUTPUT_TOKENS headroom, NOT the - // smaller API request cap. This ensures long-context V4 sessions don't - // compact prematurely. - let internal_budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS) - .expect("V4 should have a known context window"); - let api_cap_budget = context_input_budget( - "deepseek-v4-pro", - effective_max_output_tokens("deepseek-v4-pro"), - ) - .expect("V4 should have a known context window"); - - // Internal budget reserves 262K for output; API-cap budget would only - // reserve 64K. Internal budget must be smaller (more conservative). - assert!( - internal_budget < api_cap_budget, - "Internal budget ({internal_budget}) should be smaller than API-cap budget ({api_cap_budget}) \ - because it reserves more headroom for output" - ); - - // Verify the internal budget is what the compaction logic actually uses. +fn internal_context_budget_tiers_reserved_output_by_window() { + // Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS + // headroom so long V4 sessions don't compact prematurely. 
+ let internal_budget = + context_input_budget("deepseek-v4-pro").expect("V4 should have a known context window"); let v4_window: usize = 1_000_000; let expected_internal = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize; assert_eq!(internal_budget, expected_internal); + + // Sub-500K windows cross into the effective-cap branch: a 256K self-hosted + // deployment must yield a usable positive budget rather than None. The + // previous formula reserved the full 262K and computed 256K - 262K - 1K, + // which underflowed to None and silently disabled preflight/recovery. + let small_window_budget = context_input_budget("qwen3-32b-256k") + .expect("a 256K-suffix model must yield Some budget via the effective-cap branch"); + let effective_output = effective_max_output_tokens("qwen3-32b-256k") as usize; + let expected_small = 256_000 - effective_output - 1_024; + assert_eq!(small_window_budget, expected_small); } #[test] diff --git a/crates/tui/src/core/engine/turn_loop.rs b/crates/tui/src/core/engine/turn_loop.rs index 9f2da5ffd..a74e64302 100644 --- a/crates/tui/src/core/engine/turn_loop.rs +++ b/crates/tui/src/core/engine/turn_loop.rs @@ -173,9 +173,7 @@ impl Engine { continue; } - if let Some(input_budget) = - context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS) - { + if let Some(input_budget) = context_input_budget(&self.session.model) { let estimated_input = self.estimated_input_tokens(); if estimated_input > input_budget { if context_recovery_attempts >= MAX_CONTEXT_RECOVERY_ATTEMPTS { @@ -192,11 +190,7 @@ impl Engine { } if self - .recover_context_overflow( - &client, - "preflight token budget", - TURN_MAX_OUTPUT_TOKENS, - ) + .recover_context_overflow(&client, "preflight token budget") .await { context_recovery_attempts = context_recovery_attempts.saturating_add(1); @@ -326,11 +320,7 @@ impl Engine { if is_context_length_error_message(&message) && context_recovery_attempts < MAX_CONTEXT_RECOVERY_ATTEMPTS && self - .recover_context_overflow( 
- &client, - "provider context-length rejection", - TURN_MAX_OUTPUT_TOKENS, - ) + .recover_context_overflow(&client, "provider context-length rejection") .await { context_recovery_attempts = context_recovery_attempts.saturating_add(1); diff --git a/crates/tui/src/models.rs b/crates/tui/src/models.rs index a5f52c6d3..91c642e16 100644 --- a/crates/tui/src/models.rs +++ b/crates/tui/src/models.rs @@ -208,16 +208,22 @@ pub struct Usage { } /// Map known models to their approximate context window sizes. +/// +/// Lookup order: +/// 1. An explicit `_Nk` suffix in the model name, for **any** vendor. This +/// lets self-hosted deployments advertise their window through the served +/// model name (e.g. a vLLM `--served-model-name qwen3-32b-256k`), which is +/// the only signal we have for non-DeepSeek/Claude models. The 1000-token +/// approximation is fine for compaction-threshold math. +/// 2. DeepSeek vendor heuristics (V4 family -> 1M, legacy -> 128K). +/// 3. Claude -> 200K. #[must_use] pub fn context_window_for_model(model: &str) -> Option { let lower = model.to_lowercase(); - // Unknown legacy DeepSeek model IDs default to 128K unless an explicit - // *k suffix is present. DeepSeek-V4 family and current compatibility - // aliases ship with a 1M context window. + if let Some(explicit_window) = explicit_context_window_hint(&lower) { + return Some(explicit_window); + } if lower.contains("deepseek") { - if let Some(explicit_window) = deepseek_context_window_hint(&lower) { - return Some(explicit_window); - } if lower.contains("v4") { return Some(DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS); } @@ -229,7 +235,9 @@ pub fn context_window_for_model(model: &str) -> Option { None } -fn deepseek_context_window_hint(model_lower: &str) -> Option { +/// Parse an explicit `_Nk` context-window hint from a model name (vendor +/// agnostic). Returns the window in tokens for `N` in `8..=1024`. 
+fn explicit_context_window_hint(model_lower: &str) -> Option { let bytes = model_lower.as_bytes(); let mut i = 0usize; while i < bytes.len() {