From 458c551cbdf0a42fbb97e14ff2c674a6936100c8 Mon Sep 17 00:00:00 2001
From: hexin <he.xin@h3c.com>
Date: Mon, 25 May 2026 11:53:47 +0800
Subject: [PATCH] fix(engine): keep auto-compaction working on sub-500K
 self-hosted windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`context_input_budget` reserved the full TURN_MAX_OUTPUT_TOKENS (262K) for
output regardless of window size. For a self-hosted model with a window
below that reservation (e.g. a 256K vLLM Qwen deployment) the math was
`256K - 262K - 1K`, which would go negative, so `checked_sub` returns
`None`. A `None` budget silently disables every preflight check and
emergency context recovery path, so the session never compacts and runs
until the provider hard-rejects on context length.

Two coupled fixes:

1. `context_window_for_model`: apply the explicit `<N>k` suffix hint (e.g.
   the `-256k` in `qwen3-32b-256k`) for any vendor, not just DeepSeek. For
   non-DeepSeek/Claude models a self-hosted served-model-name like that is
   the only window signal we have; without this its window resolves to
   `None`. Renamed the helper `deepseek_context_window_hint` ->
   `explicit_context_window_hint` since it is now vendor-agnostic.

2. `context_input_budget`: tier the reserved-output term by window —
   `>= 500K` keeps the full 262K headroom (preserves the V4 interleaved-
   thinking contract), `< 500K` falls back to `effective_max_output_tokens`
   (what the API actually caps output at), yielding a usable positive budget.
   Dropped the vestigial `requested_output_tokens` parameter (every caller
   passed the same constant).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tui/src/core/engine.rs               | 17 +++------
 crates/tui/src/core/engine/capacity_flow.rs |  2 +-
 crates/tui/src/core/engine/context.rs       | 32 +++++++++++++++--
 crates/tui/src/core/engine/tests.rs         | 40 +++++++++------------
 crates/tui/src/core/engine/turn_loop.rs     | 16 ++-------
 crates/tui/src/models.rs                    | 22 ++++++++----
 6 files changed, 70 insertions(+), 59 deletions(-)

diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs
index 202cd1648..52bddbc32 100644
--- a/crates/tui/src/core/engine.rs
+++ b/crates/tui/src/core/engine.rs
@@ -1286,15 +1286,8 @@ impl Engine {
         removed
     }
 
-    async fn recover_context_overflow(
-        &mut self,
-        client: &DeepSeekClient,
-        reason: &str,
-        requested_output_tokens: u32,
-    ) -> bool {
-        let Some(target_budget) =
-            context_input_budget(&self.session.model, requested_output_tokens)
-        else {
+    async fn recover_context_overflow(&mut self, client: &DeepSeekClient, reason: &str) -> bool {
+        let Some(target_budget) = context_input_budget(&self.session.model) else {
             return false;
         };
 
@@ -1970,9 +1963,9 @@ mod handle;
 pub(crate) use context::compact_tool_result_for_context;
 use context::{
     COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP,
-    TURN_MAX_OUTPUT_TOKENS, context_input_budget, effective_max_output_tokens,
-    estimate_input_tokens_conservative, extract_compaction_summary_prompt,
-    is_context_length_error_message, summarize_text, turn_response_headroom_tokens,
+    context_input_budget, effective_max_output_tokens, estimate_input_tokens_conservative,
+    extract_compaction_summary_prompt, is_context_length_error_message, summarize_text,
+    turn_response_headroom_tokens,
 };
 mod dispatch;
 mod loop_guard;
diff --git a/crates/tui/src/core/engine/capacity_flow.rs b/crates/tui/src/core/engine/capacity_flow.rs
index cee5fb76e..fe3577622 100644
--- a/crates/tui/src/core/engine/capacity_flow.rs
+++ b/crates/tui/src/core/engine/capacity_flow.rs
@@ -435,7 +435,7 @@ impl Engine {
         }
 
         if !refreshed {
-            let target_budget = context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
+            let target_budget = context_input_budget(&self.session.model)
                 .unwrap_or(self.config.compaction.token_threshold.max(1));
             if self.estimated_input_tokens() > target_budget {
                 let trimmed = self.trim_oldest_messages_to_budget(target_budget);
diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs
index cb97e7744..6a28d6b45 100644
--- a/crates/tui/src/core/engine/context.rs
+++ b/crates/tui/src/core/engine/context.rs
@@ -354,9 +354,35 @@ pub(super) fn estimate_input_tokens_conservative(
         .saturating_add(framing_overhead)
 }
 
-pub(super) fn context_input_budget(model: &str, requested_output_tokens: u32) -> Option<usize> {
-    let window = usize::try_from(context_window_for_model(model)?).ok()?;
-    let output = usize::try_from(requested_output_tokens).ok()?;
+/// Context windows at or above this size reserve the full
+/// [`TURN_MAX_OUTPUT_TOKENS`] (262K) when computing the internal input budget,
+/// leaving room for V4-class interleaved thinking. Below it, the reservation
+/// falls back to [`effective_max_output_tokens`] so a smaller self-hosted
+/// window does not underflow to a negative budget.
+const INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD: u32 = 500_000;
+
+/// Internal input-side token budget for a model: `window - reserved_output -
+/// headroom`. Used by the preflight check, emergency recovery, and capacity
+/// trimming to decide when to compact.
+///
+/// The reserved-output term is window-dependent:
+///   * `window >= 500K` (V4-class large-context) -> [`TURN_MAX_OUTPUT_TOKENS`]
+///     (262K). Preserves the "leave room for interleaved thinking" contract.
+///   * `window < 500K` (smaller / self-hosted, e.g. a 256K vLLM Qwen window)
+///     -> [`effective_max_output_tokens`], i.e. what the API actually caps
+///     output at. Reserving the full 262K here would compute
+///     `256K - 262K - 1K`, which underflows `checked_sub` to `None` and
+///     *silently disables every preflight and emergency recovery path* — the
+///     session then runs until the provider hard-rejects on context length.
+pub(super) fn context_input_budget(model: &str) -> Option<usize> {
+    let window_tokens = context_window_for_model(model)?;
+    let window = usize::try_from(window_tokens).ok()?;
+    let reserved_output = if window_tokens >= INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD {
+        TURN_MAX_OUTPUT_TOKENS
+    } else {
+        effective_max_output_tokens(model)
+    };
+    let output = usize::try_from(reserved_output).ok()?;
     window
         .checked_sub(output)
         .and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS))
diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs
index 851b09ea0..e68f5fb24 100644
--- a/crates/tui/src/core/engine/tests.rs
+++ b/crates/tui/src/core/engine/tests.rs
@@ -1,5 +1,6 @@
 use super::*;
 
+use super::context::TURN_MAX_OUTPUT_TOKENS;
 use crate::models::SystemBlock;
 use crate::test_support::lock_test_env;
 use crate::tools::spec::ToolCapability;
@@ -916,7 +917,7 @@ fn detects_context_length_errors_from_provider_payloads() {
 fn context_budget_reserves_output_and_headroom() {
     // V4 has a 1M context window — the only family that comfortably hosts
     // a 256K output reservation without saturating the input budget to 0.
-    let budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
+    let budget = context_input_budget("deepseek-v4-pro")
         .expect("deepseek-v4-pro should have a known context window");
     let v4_window: usize = 1_000_000;
     let expected = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
@@ -943,31 +944,24 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
 }
 
 #[test]
-fn internal_context_budget_unaffected_by_api_request_cap() {
-    // The internal context budget (used for compaction/preflight/recovery)
-    // must still use the full TURN_MAX_OUTPUT_TOKENS headroom, NOT the
-    // smaller API request cap. This ensures long-context V4 sessions don't
-    // compact prematurely.
-    let internal_budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
-        .expect("V4 should have a known context window");
-    let api_cap_budget = context_input_budget(
-        "deepseek-v4-pro",
-        effective_max_output_tokens("deepseek-v4-pro"),
-    )
-    .expect("V4 should have a known context window");
-
-    // Internal budget reserves 262K for output; API-cap budget would only
-    // reserve 64K. Internal budget must be smaller (more conservative).
-    assert!(
-        internal_budget < api_cap_budget,
-        "Internal budget ({internal_budget}) should be smaller than API-cap budget ({api_cap_budget}) \
-         because it reserves more headroom for output"
-    );
-
-    // Verify the internal budget is what the compaction logic actually uses.
+fn internal_context_budget_tiers_reserved_output_by_window() {
+    // Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
+    // headroom so long V4 sessions don't compact prematurely.
+    let internal_budget =
+        context_input_budget("deepseek-v4-pro").expect("V4 should have a known context window");
     let v4_window: usize = 1_000_000;
     let expected_internal = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
     assert_eq!(internal_budget, expected_internal);
+
+    // Sub-500K windows cross into the effective-cap branch: a 256K self-hosted
+    // deployment must yield a usable positive budget rather than None. The
+    // previous formula reserved the full 262K and computed 256K - 262K - 1K,
+    // which underflowed to None and silently disabled preflight/recovery.
+    let small_window_budget = context_input_budget("qwen3-32b-256k")
+        .expect("a 256K-suffix model must yield Some budget via the effective-cap branch");
+    let effective_output = effective_max_output_tokens("qwen3-32b-256k") as usize;
+    let expected_small = 256_000 - effective_output - 1_024;
+    assert_eq!(small_window_budget, expected_small);
 }
 
 #[test]
diff --git a/crates/tui/src/core/engine/turn_loop.rs b/crates/tui/src/core/engine/turn_loop.rs
index 9f2da5ffd..a74e64302 100644
--- a/crates/tui/src/core/engine/turn_loop.rs
+++ b/crates/tui/src/core/engine/turn_loop.rs
@@ -173,9 +173,7 @@ impl Engine {
                 continue;
             }
 
-            if let Some(input_budget) =
-                context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
-            {
+            if let Some(input_budget) = context_input_budget(&self.session.model) {
                 let estimated_input = self.estimated_input_tokens();
                 if estimated_input > input_budget {
                     if context_recovery_attempts >= MAX_CONTEXT_RECOVERY_ATTEMPTS {
@@ -192,11 +190,7 @@ impl Engine {
                     }
 
                     if self
-                        .recover_context_overflow(
-                            &client,
-                            "preflight token budget",
-                            TURN_MAX_OUTPUT_TOKENS,
-                        )
+                        .recover_context_overflow(&client, "preflight token budget")
                         .await
                     {
                         context_recovery_attempts = context_recovery_attempts.saturating_add(1);
@@ -326,11 +320,7 @@ impl Engine {
                     if is_context_length_error_message(&message)
                         && context_recovery_attempts < MAX_CONTEXT_RECOVERY_ATTEMPTS
                         && self
-                            .recover_context_overflow(
-                                &client,
-                                "provider context-length rejection",
-                                TURN_MAX_OUTPUT_TOKENS,
-                            )
+                            .recover_context_overflow(&client, "provider context-length rejection")
                             .await
                     {
                         context_recovery_attempts = context_recovery_attempts.saturating_add(1);
diff --git a/crates/tui/src/models.rs b/crates/tui/src/models.rs
index a5f52c6d3..91c642e16 100644
--- a/crates/tui/src/models.rs
+++ b/crates/tui/src/models.rs
@@ -208,16 +208,22 @@ pub struct Usage {
 }
 
 /// Map known models to their approximate context window sizes.
+///
+/// Lookup order:
+/// 1. An explicit `_Nk` suffix in the model name, for **any** vendor. This
+///    lets self-hosted deployments advertise their window through the served
+///    model name (e.g. a vLLM `--served-model-name qwen3-32b-256k`), which is
+///    the only signal we have for non-DeepSeek/Claude models. The 1000-token
+///    approximation is fine for compaction-threshold math.
+/// 2. DeepSeek vendor heuristics (V4 family -> 1M, legacy -> 128K).
+/// 3. Claude -> 200K.
 #[must_use]
 pub fn context_window_for_model(model: &str) -> Option<u32> {
     let lower = model.to_lowercase();
-    // Unknown legacy DeepSeek model IDs default to 128K unless an explicit
-    // *k suffix is present. DeepSeek-V4 family and current compatibility
-    // aliases ship with a 1M context window.
+    if let Some(explicit_window) = explicit_context_window_hint(&lower) {
+        return Some(explicit_window);
+    }
     if lower.contains("deepseek") {
-        if let Some(explicit_window) = deepseek_context_window_hint(&lower) {
-            return Some(explicit_window);
-        }
         if lower.contains("v4") {
             return Some(DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS);
         }
@@ -229,7 +235,9 @@ pub fn context_window_for_model(model: &str) -> Option<u32> {
     None
 }
 
-fn deepseek_context_window_hint(model_lower: &str) -> Option<u32> {
+/// Parse an explicit `_Nk` context-window hint from a model name (vendor
+/// agnostic). Returns the window in tokens for `N` in `8..=1024`.
+fn explicit_context_window_hint(model_lower: &str) -> Option<u32> {
     let bytes = model_lower.as_bytes();
     let mut i = 0usize;
     while i < bytes.len() {