From af65096a4bd5d30374ab8fd727793c8ab1bf744d Mon Sep 17 00:00:00 2001 From: Meur3ault Date: Sun, 10 May 2026 18:59:54 +0900 Subject: [PATCH] fix(research): derive context budget from actual model context window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The research sub-agent's context thresholds were hard-coded as module-level constants assuming a 200k context window: _RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k _RESEARCH_CONTEXT_MAX = 190_000 With claude-sonnet-4-6 (the default research model) having a 1M context window, the sub-agent was being terminated at 19% of its actual capacity. Also fixes two secondary issues: - Warn threshold was 85% but the injected system message said 75% — now aligned to 75% to match the prompt text. - For models with <190k context windows, the old hard-coded MAX would exceed the API limit causing ContextWindowExceededError mid-research. Move the constants inside research_handler and compute them from _get_max_tokens_safe(research_model), the same helper the main session already uses for its own compaction threshold. Adds tests/unit/test_research_context_budget.py (4 cases). --- agent/tools/research_tool.py | 19 ++++--- tests/unit/test_research_context_budget.py | 65 ++++++++++++++++++++++ 2 files changed, 76 insertions(+), 8 deletions(-) create mode 100644 tests/unit/test_research_context_budget.py diff --git a/agent/tools/research_tool.py b/agent/tools/research_tool.py index f5815be8..5a49d11e 100644 --- a/agent/tools/research_tool.py +++ b/agent/tools/research_tool.py @@ -18,15 +18,10 @@ from agent.core.doom_loop import check_for_doom_loop from agent.core.llm_params import _resolve_llm_params from agent.core.prompt_caching import with_prompt_caching -from agent.core.session import Event +from agent.core.session import Event, _get_max_tokens_safe logger = logging.getLogger(__name__) -# Context budget for the research subagent (tokens). 
-# When usage exceeds WARN threshold, the subagent is told to wrap up. -# At MAX, the loop is force-stopped and whatever content exists is returned. -_RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k -_RESEARCH_CONTEXT_MAX = 190_000 # Tools the research agent can use (read-only subset) RESEARCH_TOOL_NAMES = { @@ -254,6 +249,14 @@ async def research_handler( # Use a cheaper/faster model for research main_model = session.config.model_name research_model = _get_research_model(main_model) + _ctx_max = _get_max_tokens_safe(research_model) + # Context budget for the research subagent (tokens). + # When usage exceeds WARN threshold, the subagent is told to wrap up. + # At MAX, the loop is force-stopped and whatever content exists is returned. + _research_context_warn = int( + _ctx_max * 0.75 + ) # Corresponding to the prompt "You have used 75% of your context budget." + _research_context_max = int(_ctx_max * 0.95) # Hard-stop at 95% # Research is a cheap sub-call — cap the main session's effort at "high" # so a user preference of ``max``/``xhigh`` (valid for Opus 4.6/4.7) doesn't # propagate to a Sonnet research model that may not accept those levels. 
@@ -322,7 +325,7 @@ async def _log(text: str) -> None: messages.append(Message(role="user", content=doom_prompt)) # ── Context budget: warn at 75%, hard-stop at 95% ── - if _total_tokens >= _RESEARCH_CONTEXT_MAX: + if _total_tokens >= _research_context_max: logger.warning( "Research sub-agent hit context max (%d tokens) — forcing summary", _total_tokens, @@ -374,7 +377,7 @@ async def _log(text: str) -> None: except Exception: return "Research context exhausted and summary call failed.", False - if not _warned_context and _total_tokens >= _RESEARCH_CONTEXT_WARN: + if not _warned_context and _total_tokens >= _research_context_warn: _warned_context = True await _log(f"Context at {_total_tokens} tokens — nudging to wrap up") messages.append( diff --git a/tests/unit/test_research_context_budget.py b/tests/unit/test_research_context_budget.py new file mode 100644 index 00000000..d2609424 --- /dev/null +++ b/tests/unit/test_research_context_budget.py @@ -0,0 +1,65 @@ +"""Tests for dynamic research sub-agent context budget. + +Regression for the hard-coded 170k/190k budget that assumed a 200k context +window regardless of the actual research model. With claude-sonnet-4-6 having +a 1M context window, the sub-agent was being terminated at ~19% capacity. 
+""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from agent.tools.research_tool import _get_research_model, research_handler + + +# ── _get_research_model ──────────────────────────────────────────────── + + +def test_anthropic_main_model_uses_sonnet_for_research(): + assert ( + _get_research_model("anthropic/claude-opus-4-7") + == "anthropic/claude-sonnet-4-6" + ) + assert ( + _get_research_model("anthropic/claude-opus-4-6") + == "anthropic/claude-sonnet-4-6" + ) + + +def test_bedrock_anthropic_model_uses_bedrock_sonnet(): + result = _get_research_model("bedrock/us.anthropic.claude-opus-4-6-v1") + assert result == "bedrock/us.anthropic.claude-sonnet-4-6" + + +def test_non_anthropic_model_falls_back_to_same_model(): + assert _get_research_model("openai/gpt-5.5") == "openai/gpt-5.5" + assert _get_research_model("moonshotai/Kimi-K2.6") == "moonshotai/Kimi-K2.6" + + +# ── research_handler calls _get_max_tokens_safe ──────────────────────── + + +@pytest.mark.asyncio +async def test_research_handler_calls_get_max_tokens_safe_with_research_model(): + """`_get_max_tokens_safe` must be called with the *research* model id, + not the main model id, so the budget reflects the sub-agent's model.""" + fake_session = MagicMock() + fake_session.config.model_name = "anthropic/claude-opus-4-7" + fake_session.config.reasoning_effort = None + fake_session.hf_token = None + fake_session.tool_router.get_tool_specs_for_llm.return_value = [] + fake_session.send_event = AsyncMock() + + with ( + patch( + "agent.tools.research_tool._get_max_tokens_safe", return_value=1_000_000 + ) as mock_ctx, + patch( + "agent.tools.research_tool.acompletion", + side_effect=RuntimeError("abort"), + ), + ): + await research_handler({"task": "test task"}, session=fake_session) + + # Must be called with the research model (sonnet), not the main model (opus) + mock_ctx.assert_called_once_with("anthropic/claude-sonnet-4-6")