Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions crates/tui/src/core/engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1286,15 +1286,8 @@ impl Engine {
removed
}

async fn recover_context_overflow(
&mut self,
client: &DeepSeekClient,
reason: &str,
requested_output_tokens: u32,
) -> bool {
let Some(target_budget) =
context_input_budget(&self.session.model, requested_output_tokens)
else {
async fn recover_context_overflow(&mut self, client: &DeepSeekClient, reason: &str) -> bool {
let Some(target_budget) = context_input_budget(&self.session.model) else {
return false;
};

Expand Down Expand Up @@ -1970,9 +1963,9 @@ mod handle;
pub(crate) use context::compact_tool_result_for_context;
use context::{
COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP,
TURN_MAX_OUTPUT_TOKENS, context_input_budget, effective_max_output_tokens,
estimate_input_tokens_conservative, extract_compaction_summary_prompt,
is_context_length_error_message, summarize_text, turn_response_headroom_tokens,
context_input_budget, effective_max_output_tokens, estimate_input_tokens_conservative,
extract_compaction_summary_prompt, is_context_length_error_message, summarize_text,
turn_response_headroom_tokens,
};
mod dispatch;
mod loop_guard;
Expand Down
2 changes: 1 addition & 1 deletion crates/tui/src/core/engine/capacity_flow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ impl Engine {
}

if !refreshed {
let target_budget = context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
let target_budget = context_input_budget(&self.session.model)
.unwrap_or(self.config.compaction.token_threshold.max(1));
if self.estimated_input_tokens() > target_budget {
let trimmed = self.trim_oldest_messages_to_budget(target_budget);
Expand Down
32 changes: 29 additions & 3 deletions crates/tui/src/core/engine/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -354,9 +354,35 @@ pub(super) fn estimate_input_tokens_conservative(
.saturating_add(framing_overhead)
}

pub(super) fn context_input_budget(model: &str, requested_output_tokens: u32) -> Option<usize> {
let window = usize::try_from(context_window_for_model(model)?).ok()?;
let output = usize::try_from(requested_output_tokens).ok()?;
/// Window-size cutoff, in tokens, that selects how much output
/// [`context_input_budget`] reserves when computing the internal input budget.
///
/// Context windows at or above this size reserve the full
/// [`TURN_MAX_OUTPUT_TOKENS`] (262K), leaving room for V4-class interleaved
/// thinking. Below it, the reservation falls back to
/// [`effective_max_output_tokens`], so that subtracting the reservation from a
/// smaller self-hosted window is not pushed past zero, where `checked_sub`
/// would turn the budget into `None`.
const INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD: u32 = 500_000;

/// Internal input-side token budget for a model: `window - reserved_output -
/// headroom`. Used by the preflight check, emergency recovery, and capacity
/// trimming to decide when to compact.
///
/// The reserved-output term is window-dependent:
/// * `window >= 500K` (V4-class large-context) -> [`TURN_MAX_OUTPUT_TOKENS`]
/// (262K). Preserves the "leave room for interleaved thinking" contract.
/// * `window < 500K` (smaller / self-hosted, e.g. a 256K vLLM Qwen window)
/// -> [`effective_max_output_tokens`], i.e. what the API actually caps
/// output at. Reserving the full 262K here would compute
/// `256K - 262K - 1K`, which underflows `checked_sub` to `None` and
/// *silently disables every preflight and emergency recovery path* — the
/// session then runs until the provider hard-rejects on context length.
pub(super) fn context_input_budget(model: &str) -> Option<usize> {
let window_tokens = context_window_for_model(model)?;
let window = usize::try_from(window_tokens).ok()?;
let reserved_output = if window_tokens >= INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD {
TURN_MAX_OUTPUT_TOKENS
} else {
effective_max_output_tokens(model)
};
let output = usize::try_from(reserved_output).ok()?;
window
.checked_sub(output)
.and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS))
Expand Down
40 changes: 17 additions & 23 deletions crates/tui/src/core/engine/tests.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use super::*;

use super::context::TURN_MAX_OUTPUT_TOKENS;
use crate::models::SystemBlock;
use crate::test_support::lock_test_env;
use crate::tools::spec::ToolCapability;
Expand Down Expand Up @@ -916,7 +917,7 @@ fn detects_context_length_errors_from_provider_payloads() {
fn context_budget_reserves_output_and_headroom() {
// V4 has a 1M context window — the only family that comfortably hosts the
// full TURN_MAX_OUTPUT_TOKENS (262K) output reservation without exhausting the input budget.
let budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
let budget = context_input_budget("deepseek-v4-pro")
.expect("deepseek-v4-pro should have a known context window");
let v4_window: usize = 1_000_000;
let expected = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
Expand All @@ -943,31 +944,24 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
}

#[test]
fn internal_context_budget_unaffected_by_api_request_cap() {
// The internal context budget (used for compaction/preflight/recovery)
// must still use the full TURN_MAX_OUTPUT_TOKENS headroom, NOT the
// smaller API request cap. This ensures long-context V4 sessions don't
// compact prematurely.
let internal_budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
.expect("V4 should have a known context window");
let api_cap_budget = context_input_budget(
"deepseek-v4-pro",
effective_max_output_tokens("deepseek-v4-pro"),
)
.expect("V4 should have a known context window");

// Internal budget reserves 262K for output; API-cap budget would only
// reserve 64K. Internal budget must be smaller (more conservative).
assert!(
internal_budget < api_cap_budget,
"Internal budget ({internal_budget}) should be smaller than API-cap budget ({api_cap_budget}) \
because it reserves more headroom for output"
);

// Verify the internal budget is what the compaction logic actually uses.
fn internal_context_budget_tiers_reserved_output_by_window() {
// Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
// headroom so long V4 sessions don't compact prematurely.
let internal_budget =
context_input_budget("deepseek-v4-pro").expect("V4 should have a known context window");
let v4_window: usize = 1_000_000;
let expected_internal = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
assert_eq!(internal_budget, expected_internal);

// Sub-500K windows cross into the effective-cap branch: a 256K self-hosted
// deployment must yield a usable positive budget rather than None. The
// previous formula reserved the full 262K and computed 256K - 262K - 1K,
// which underflowed to None and silently disabled preflight/recovery.
let small_window_budget = context_input_budget("qwen3-32b-256k")
.expect("a 256K-suffix model must yield Some budget via the effective-cap branch");
let effective_output = effective_max_output_tokens("qwen3-32b-256k") as usize;
let expected_small = 256_000 - effective_output - 1_024;
assert_eq!(small_window_budget, expected_small);
}

#[test]
Expand Down
16 changes: 3 additions & 13 deletions crates/tui/src/core/engine/turn_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,7 @@ impl Engine {
continue;
}

if let Some(input_budget) =
context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
{
if let Some(input_budget) = context_input_budget(&self.session.model) {
let estimated_input = self.estimated_input_tokens();
if estimated_input > input_budget {
if context_recovery_attempts >= MAX_CONTEXT_RECOVERY_ATTEMPTS {
Expand All @@ -192,11 +190,7 @@ impl Engine {
}

if self
.recover_context_overflow(
&client,
"preflight token budget",
TURN_MAX_OUTPUT_TOKENS,
)
.recover_context_overflow(&client, "preflight token budget")
.await
{
context_recovery_attempts = context_recovery_attempts.saturating_add(1);
Expand Down Expand Up @@ -326,11 +320,7 @@ impl Engine {
if is_context_length_error_message(&message)
&& context_recovery_attempts < MAX_CONTEXT_RECOVERY_ATTEMPTS
&& self
.recover_context_overflow(
&client,
"provider context-length rejection",
TURN_MAX_OUTPUT_TOKENS,
)
.recover_context_overflow(&client, "provider context-length rejection")
.await
{
context_recovery_attempts = context_recovery_attempts.saturating_add(1);
Expand Down
22 changes: 15 additions & 7 deletions crates/tui/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,16 +208,22 @@ pub struct Usage {
}

/// Map known models to their approximate context window sizes.
///
/// Lookup order:
/// 1. An explicit `Nk` suffix in the model name (e.g. `-256k`), for **any**
/// vendor. This lets self-hosted deployments advertise their window through
/// the served model name (e.g. a vLLM `--served-model-name qwen3-32b-256k`),
/// which is the only signal we have for non-DeepSeek/Claude models. Treating
/// `k` as 1000 tokens is an approximation that is fine for
/// compaction-threshold math.
/// 2. DeepSeek vendor heuristics (V4 family -> 1M, legacy -> 128K).
/// 3. Claude -> 200K.
#[must_use]
pub fn context_window_for_model(model: &str) -> Option<u32> {
let lower = model.to_lowercase();
// Unknown legacy DeepSeek model IDs default to 128K unless an explicit
// *k suffix is present. DeepSeek-V4 family and current compatibility
// aliases ship with a 1M context window.
if let Some(explicit_window) = explicit_context_window_hint(&lower) {
return Some(explicit_window);
}
if lower.contains("deepseek") {
if let Some(explicit_window) = deepseek_context_window_hint(&lower) {
return Some(explicit_window);
}
if lower.contains("v4") {
return Some(DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS);
}
Expand All @@ -229,7 +235,9 @@ pub fn context_window_for_model(model: &str) -> Option<u32> {
None
}

fn deepseek_context_window_hint(model_lower: &str) -> Option<u32> {
/// Parse an explicit `Nk` context-window hint (e.g. the `256k` in
/// `qwen3-32b-256k`) from a model name (vendor agnostic). Returns the window
/// in tokens for `N` in `8..=1024`.
fn explicit_context_window_hint(model_lower: &str) -> Option<u32> {
let bytes = model_lower.as_bytes();
let mut i = 0usize;
while i < bytes.len() {
Expand Down
Loading