Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions agent/tools/research_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,10 @@
from agent.core.doom_loop import check_for_doom_loop
from agent.core.llm_params import _resolve_llm_params
from agent.core.prompt_caching import with_prompt_caching
from agent.core.session import Event
from agent.core.session import Event, _get_max_tokens_safe

logger = logging.getLogger(__name__)

# Context budget for the research subagent (tokens).
# When usage exceeds WARN threshold, the subagent is told to wrap up.
# At MAX, the loop is force-stopped and whatever content exists is returned.
_RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k
_RESEARCH_CONTEXT_MAX = 190_000

# Tools the research agent can use (read-only subset)
RESEARCH_TOOL_NAMES = {
Expand Down Expand Up @@ -254,6 +249,14 @@ async def research_handler(
# Use a cheaper/faster model for research
main_model = session.config.model_name
research_model = _get_research_model(main_model)
_ctx_max = _get_max_tokens_safe(research_model)
# Context budget for the research subagent (tokens).
# When usage exceeds WARN threshold, the subagent is told to wrap up.
# At MAX, the loop is force-stopped and whatever content exists is returned.
_research_context_warn = int(
_ctx_max * 0.75
) # Corresponding to the prompt "You have used 75% of your context budget."
_research_context_max = int(_ctx_max * 0.95) # Hard-stop at 95%
# Research is a cheap sub-call — cap the main session's effort at "high"
# so a user preference of ``max``/``xhigh`` (valid for Opus 4.6/4.7) doesn't
# propagate to a Sonnet research model that may not accept those levels.
Expand Down Expand Up @@ -322,7 +325,7 @@ async def _log(text: str) -> None:
messages.append(Message(role="user", content=doom_prompt))

# ── Context budget: warn at 75%, hard-stop at 95% ──
if _total_tokens >= _RESEARCH_CONTEXT_MAX:
if _total_tokens >= _research_context_max:
logger.warning(
"Research sub-agent hit context max (%d tokens) — forcing summary",
_total_tokens,
Expand Down Expand Up @@ -374,7 +377,7 @@ async def _log(text: str) -> None:
except Exception:
return "Research context exhausted and summary call failed.", False

if not _warned_context and _total_tokens >= _RESEARCH_CONTEXT_WARN:
if not _warned_context and _total_tokens >= _research_context_warn:
_warned_context = True
await _log(f"Context at {_total_tokens} tokens — nudging to wrap up")
messages.append(
Expand Down
65 changes: 65 additions & 0 deletions tests/unit/test_research_context_budget.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Tests for dynamic research sub-agent context budget.

Regression for the hard-coded 170k/190k budget that assumed a 200k context
window regardless of the actual research model. With claude-sonnet-4-6 having
a 1M context window, the sub-agent was being terminated at ~19% capacity.
"""

from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from agent.tools.research_tool import _get_research_model, research_handler


# ── _get_research_model ────────────────────────────────────────────────


def test_anthropic_main_model_uses_sonnet_for_research():
    """Anthropic Opus main models are mapped to the Sonnet research model."""
    expected_research_model = "anthropic/claude-sonnet-4-6"
    opus_main_models = (
        "anthropic/claude-opus-4-7",
        "anthropic/claude-opus-4-6",
    )
    # Checked in the same order as before; the first mismatch fails the test.
    for main_model in opus_main_models:
        assert _get_research_model(main_model) == expected_research_model


def test_bedrock_anthropic_model_uses_bedrock_sonnet():
    """A Bedrock-hosted Opus id resolves to the Bedrock-hosted Sonnet id."""
    assert (
        _get_research_model("bedrock/us.anthropic.claude-opus-4-6-v1")
        == "bedrock/us.anthropic.claude-sonnet-4-6"
    )


def test_non_anthropic_model_falls_back_to_same_model():
    """Non-Anthropic model ids are returned unchanged as the research model."""
    passthrough_models = ("openai/gpt-5.5", "moonshotai/Kimi-K2.6")
    for model_id in passthrough_models:
        assert _get_research_model(model_id) == model_id


# ── research_handler calls _get_max_tokens_safe ────────────────────────


@pytest.mark.asyncio
async def test_research_handler_calls_get_max_tokens_safe_with_research_model():
    """The context-window lookup must receive the *research* model id.

    The session is configured with an Opus main model; the budget lookup is
    expected to be made exactly once, with the Sonnet research model id.
    """
    # Minimal stand-in for the session: only the attributes set here are
    # given explicit values, everything else is an auto-created MagicMock.
    session_stub = MagicMock()
    session_stub.send_event = AsyncMock()
    session_stub.hf_token = None
    session_stub.tool_router.get_tool_specs_for_llm.return_value = []
    session_stub.config.reasoning_effort = None
    session_stub.config.model_name = "anthropic/claude-opus-4-7"

    with patch(
        "agent.tools.research_tool._get_max_tokens_safe",
        return_value=1_000_000,
    ) as ctx_lookup:
        # The LLM call is patched to raise so the handler cannot proceed
        # past its first completion attempt.
        with patch(
            "agent.tools.research_tool.acompletion",
            side_effect=RuntimeError("abort"),
        ):
            await research_handler({"task": "test task"}, session=session_stub)

    # Must be called with the research model (sonnet), not the main model (opus)
    ctx_lookup.assert_called_once_with("anthropic/claude-sonnet-4-6")
Loading