From af65096a4bd5d30374ab8fd727793c8ab1bf744d Mon Sep 17 00:00:00 2001 From: Meur3ault Date: Sun, 10 May 2026 18:59:54 +0900 Subject: [PATCH] fix(research): derive context budget from actual model context window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The research sub-agent's context thresholds were hard-coded as module-level constants assuming a 200k context window: _RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k _RESEARCH_CONTEXT_MAX = 190_000 With claude-sonnet-4-6 (the default research model) having a 1M context window, the sub-agent was being terminated at 19% of its actual capacity. Also fixes two secondary issues: - Warn threshold was 85% but the injected system message said 75% — now aligned to 75% to match the prompt text. - For models with <190k context windows, the old hard-coded MAX would exceed the API limit causing ContextWindowExceededError mid-research. Move the constants inside research_handler and compute them from _get_max_tokens_safe(research_model), the same helper the main session already uses for its own compaction threshold. Adds tests/unit/test_research_context_budget.py (4 cases). --- agent/tools/research_tool.py | 19 ++++--- tests/unit/test_research_context_budget.py | 65 ++++++++++++++++++++++ 2 files changed, 76 insertions(+), 8 deletions(-) create mode 100644 tests/unit/test_research_context_budget.py diff --git a/agent/tools/research_tool.py b/agent/tools/research_tool.py index f5815be8..5a49d11e 100644 --- a/agent/tools/research_tool.py +++ b/agent/tools/research_tool.py @@ -18,15 +18,10 @@ from agent.core.doom_loop import check_for_doom_loop from agent.core.llm_params import _resolve_llm_params from agent.core.prompt_caching import with_prompt_caching -from agent.core.session import Event +from agent.core.session import Event, _get_max_tokens_safe logger = logging.getLogger(__name__) -# Context budget for the research subagent (tokens). 
-# When usage exceeds WARN threshold, the subagent is told to wrap up. -# At MAX, the loop is force-stopped and whatever content exists is returned. -_RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k -_RESEARCH_CONTEXT_MAX = 190_000 # Tools the research agent can use (read-only subset) RESEARCH_TOOL_NAMES = { @@ -254,6 +249,14 @@ async def research_handler( # Use a cheaper/faster model for research main_model = session.config.model_name research_model = _get_research_model(main_model) + _ctx_max = _get_max_tokens_safe(research_model) + # Context budget for the research subagent (tokens). + # When usage exceeds WARN threshold, the subagent is told to wrap up. + # At MAX, the loop is force-stopped and whatever content exists is returned. + _research_context_warn = int( + _ctx_max * 0.75 + ) # Corresponding to the prompt "You have used 75% of your context budget." + _research_context_max = int(_ctx_max * 0.95) # Hard-stop at 95% # Research is a cheap sub-call — cap the main session's effort at "high" # so a user preference of ``max``/``xhigh`` (valid for Opus 4.6/4.7) doesn't # propagate to a Sonnet research model that may not accept those levels. 
@@ -322,7 +325,7 @@ async def _log(text: str) -> None: messages.append(Message(role="user", content=doom_prompt)) # ── Context budget: warn at 75%, hard-stop at 95% ── - if _total_tokens >= _RESEARCH_CONTEXT_MAX: + if _total_tokens >= _research_context_max: logger.warning( "Research sub-agent hit context max (%d tokens) — forcing summary", _total_tokens, @@ -374,7 +377,7 @@ async def _log(text: str) -> None: except Exception: return "Research context exhausted and summary call failed.", False - if not _warned_context and _total_tokens >= _RESEARCH_CONTEXT_WARN: + if not _warned_context and _total_tokens >= _research_context_warn: _warned_context = True await _log(f"Context at {_total_tokens} tokens — nudging to wrap up") messages.append( diff --git a/tests/unit/test_research_context_budget.py b/tests/unit/test_research_context_budget.py new file mode 100644 index 00000000..d2609424 --- /dev/null +++ b/tests/unit/test_research_context_budget.py @@ -0,0 +1,65 @@ +"""Tests for dynamic research sub-agent context budget. + +Regression for the hard-coded 170k/190k budget that assumed a 200k context +window regardless of the actual research model. With claude-sonnet-4-6 having +a 1M context window, the sub-agent was being terminated at ~19% capacity. 
+""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from agent.tools.research_tool import _get_research_model, research_handler + + +# ── _get_research_model ──────────────────────────────────────────────── + + +def test_anthropic_main_model_uses_sonnet_for_research(): + assert ( + _get_research_model("anthropic/claude-opus-4-7") + == "anthropic/claude-sonnet-4-6" + ) + assert ( + _get_research_model("anthropic/claude-opus-4-6") + == "anthropic/claude-sonnet-4-6" + ) + + +def test_bedrock_anthropic_model_uses_bedrock_sonnet(): + result = _get_research_model("bedrock/us.anthropic.claude-opus-4-6-v1") + assert result == "bedrock/us.anthropic.claude-sonnet-4-6" + + +def test_non_anthropic_model_falls_back_to_same_model(): + assert _get_research_model("openai/gpt-5.5") == "openai/gpt-5.5" + assert _get_research_model("moonshotai/Kimi-K2.6") == "moonshotai/Kimi-K2.6" + + +# ── research_handler calls _get_max_tokens_safe ──────────────────────── + + +@pytest.mark.asyncio +async def test_research_handler_calls_get_max_tokens_safe_with_research_model(): + """`_get_max_tokens_safe` must be called with the *research* model id, + not the main model id, so the budget reflects the sub-agent's model.""" + fake_session = MagicMock() + fake_session.config.model_name = "anthropic/claude-opus-4-7" + fake_session.config.reasoning_effort = None + fake_session.hf_token = None + fake_session.tool_router.get_tool_specs_for_llm.return_value = [] + fake_session.send_event = AsyncMock() + + with ( + patch( + "agent.tools.research_tool._get_max_tokens_safe", return_value=1_000_000 + ) as mock_ctx, + patch( + "agent.tools.research_tool.acompletion", + side_effect=RuntimeError("abort"), + ), + ): + await research_handler({"task": "test task"}, session=fake_session) + + # Must be called with the research model (sonnet), not the main model (opus) + mock_ctx.assert_called_once_with("anthropic/claude-sonnet-4-6")