Merged
2 changes: 2 additions & 0 deletions codeframe/adapters/llm/__init__.py
Original file line number Diff line number Diff line change
@@ -26,6 +26,7 @@
Message,
ModelSelector,
Purpose,
StreamChunk,
Tool,
ToolCall,
ToolResult,
@@ -40,6 +41,7 @@
"Message",
"ModelSelector",
"Purpose",
"StreamChunk",
"Tool",
"ToolCall",
"ToolResult",
115 changes: 114 additions & 1 deletion codeframe/adapters/llm/anthropic.py
Expand Up @@ -3,14 +3,16 @@
Provides Claude model access via the Anthropic API.
"""

import asyncio
import os
from typing import TYPE_CHECKING, Iterator, Optional
from typing import TYPE_CHECKING, AsyncIterator, Iterator, Optional

from codeframe.adapters.llm.base import (
LLMProvider,
LLMResponse,
ModelSelector,
Purpose,
StreamChunk,
Tool,
ToolCall,
)
@@ -172,6 +174,117 @@ async def async_complete(
except APIConnectionError as exc:
raise LLMConnectionError(str(exc)) from exc

def supports(self, capability: str) -> bool:
"""Return True for capabilities this provider supports."""
return capability == "extended_thinking"
Comment on lines +177 to +179
⚠️ Potential issue | 🟠 Major

Move the try-except wrapping to the actual API call, not the dict assignment.

The try-except at lines 217–220 wraps only `kwargs["betas"] = ...`, a dict assignment that cannot fail. If Anthropic's API rejects the unsupported beta header, the error is raised at line 224 during `messages.stream(**kwargs)`, which sits outside the try-except. So when `extended_thinking=True` is used but the API doesn't support the beta feature, an unhandled exception propagates instead of the call silently degrading as the docstring claims and as `supports()` implies.

Wrap the actual messages.stream() call with the try-except to catch unsupported-beta errors from the API.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `codeframe/adapters/llm/anthropic.py` around lines 177-179: the try-except currently wraps only the dict assignment `kwargs["betas"] = ...`, but the API error will happen on the call to `messages.stream(**kwargs)`; move the try-except so it encloses `messages.stream(**kwargs)` instead of the dict assignment. Specifically, keep setting `kwargs["betas"]` when `extended_thinking` is true, but call `messages.stream(**kwargs)` inside a try block, catch the API error that indicates unsupported betas (or a generic request/HTTP exception), then degrade by removing `kwargs["betas"]` and retrying or falling back to non-beta streaming so `extended_thinking` silently degrades as `supports()` promises. Ensure you reference the same variables and methods: `kwargs`, `"betas"`, `messages.stream`, and the `extended_thinking` handling.


async def async_stream(
self,
messages: list[dict],
system: str,
tools: list[dict],
model: str,
max_tokens: int,
interrupt_event: Optional[asyncio.Event] = None,
extended_thinking: bool = False,
) -> AsyncIterator[StreamChunk]:
"""Stream using Anthropic AsyncAnthropic SDK, yielding StreamChunk objects.

Translates Anthropic SDK events into the normalized StreamChunk format.
Tool inputs are collected and emitted in the final message_stop chunk
via tool_inputs_by_id, which is more reliable than streaming input deltas.

When ``extended_thinking=True``, requests interleaved thinking via the
Anthropic betas API. The flag is silently ignored on SDK versions that
do not support it.
"""
from anthropic import AsyncAnthropic

if self._async_client is None:
self._async_client = AsyncAnthropic(api_key=self.api_key)

# Convert messages to Anthropic API format (handles tool_calls/tool_results)
converted = self._convert_messages(messages)

kwargs: dict = {
"model": model,
"system": system,
"messages": converted,
"tools": tools,
"max_tokens": max_tokens,
}
Comment on lines +209 to +215
⚠️ Potential issue | 🟠 Major

Convert streamed messages the same way as completions.

complete() and async_complete() both normalize through _convert_messages(messages), but async_stream() forwards the raw dicts. The same tool_calls / tool_results payload that works in completion mode can therefore fail once callers switch to streaming.

🐛 Proposed fix
         kwargs: dict = {
             "model": model,
             "system": system,
-            "messages": messages,
+            "messages": self._convert_messages(messages),
             "tools": tools,
             "max_tokens": max_tokens,
         }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `codeframe/adapters/llm/anthropic.py` around lines 206-212: async_stream is forwarding raw message dicts while complete() and async_complete() use `_convert_messages(messages)`; update async_stream to normalize messages by calling `_convert_messages(messages)` before building/forwarding kwargs so the same tool_calls/tool_results payloads work for streaming. Locate async_stream, assign `converted = self._convert_messages(messages)` (or the equivalent method name), and pass that converted variable into kwargs instead of the original messages.


if extended_thinking:
kwargs["betas"] = ["interleaved-thinking-2025-05-14"]

active_tool_id: Optional[str] = None

# When extended_thinking is set, the beta header may be unsupported on
# older SDK versions. Retry without it rather than hard-failing.
try:
stream_ctx = self._async_client.messages.stream(**kwargs)
except Exception: # pragma: no cover
if extended_thinking:
kwargs.pop("betas", None)
stream_ctx = self._async_client.messages.stream(**kwargs)
else:
raise

async with stream_ctx as stream:
async for sdk_event in stream:
if interrupt_event and interrupt_event.is_set():
return

event_type = sdk_event.type

if event_type == "content_block_start":
block = sdk_event.content_block
if block.type == "tool_use":
active_tool_id = block.id
yield StreamChunk(
type="tool_use_start",
tool_id=block.id,
tool_name=block.name,
tool_input=getattr(block, "input", {}),
)

elif event_type == "content_block_delta":
delta = sdk_event.delta
if delta.type == "text_delta":
yield StreamChunk(type="text_delta", text=delta.text)
elif delta.type == "thinking_delta":
yield StreamChunk(type="thinking_delta", text=delta.thinking)
# input_json_delta: final inputs are rebuilt from message_stop

elif event_type == "content_block_stop":
if active_tool_id is not None:
yield StreamChunk(type="tool_use_stop")
active_tool_id = None

elif event_type == "message_stop":
# Flush any open tool block
if active_tool_id is not None:
yield StreamChunk(type="tool_use_stop")
active_tool_id = None

final_msg = await stream.get_final_message()
stop_reason = final_msg.stop_reason or "end_turn"

# Build tool_inputs_by_id from final content blocks
tool_inputs_by_id: dict = {}
if hasattr(final_msg, "content"):
for block in final_msg.content:
if getattr(block, "type", None) == "tool_use" and hasattr(block, "id"):
tool_inputs_by_id[block.id] = getattr(block, "input", {})

yield StreamChunk(
type="message_stop",
stop_reason=stop_reason,
input_tokens=final_msg.usage.input_tokens,
output_tokens=final_msg.usage.output_tokens,
tool_inputs_by_id=tool_inputs_by_id,
)

def stream(
self,
messages: list[dict],
102 changes: 101 additions & 1 deletion codeframe/adapters/llm/base.py
@@ -9,7 +9,7 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from typing import Iterator, Optional
from typing import AsyncIterator, Iterator, Optional


# ---------------------------------------------------------------------------
@@ -120,6 +120,53 @@ def for_purpose(self, purpose: Purpose) -> str:
return self.execution_model # Default fallback


@dataclass
class StreamChunk:
"""A normalized chunk from a streaming LLM response.

Provider-specific streaming formats are translated into this common type
by each :class:`LLMProvider` implementation.

Attributes:
type: Event type — one of ``"text_delta"``, ``"thinking_delta"``,
``"tool_use_start"``, ``"tool_use_stop"``, ``"message_stop"``.
text: Text content for ``text_delta`` and ``thinking_delta`` types.
tool_id: Tool call ID for ``tool_use_start``.
tool_name: Tool name for ``tool_use_start``.
tool_input: Tool input dict for ``tool_use_start`` (may be empty;
final inputs are provided in the ``message_stop`` chunk).
input_tokens: Input token count, populated for ``message_stop``.
output_tokens: Output token count, populated for ``message_stop``.
stop_reason: Why the model stopped, populated for ``message_stop``.
tool_inputs_by_id: Mapping of tool_id → final input dict, populated
for ``message_stop``. More reliable than streaming incremental
input deltas.

.. note:: ``tool_use_stop`` ordering differs by provider:

- **Anthropic**: emitted immediately when each tool call's content
block ends (``content_block_stop`` event), so consumers see
``tool_use_start → [deltas] → tool_use_stop`` interleaved.
- **OpenAI-compatible**: emitted after the full stream ends (before
``message_stop``), because the SSE protocol has no per-tool stop
marker. All ``tool_use_stop`` chunks arrive together at the end.

Consumers MUST use ``tool_inputs_by_id`` from the ``message_stop``
chunk for final tool inputs rather than relying on ``tool_use_stop``
ordering.
"""

type: str
text: Optional[str] = None
tool_id: Optional[str] = None
tool_name: Optional[str] = None
tool_input: Optional[dict] = None
input_tokens: Optional[int] = None
output_tokens: Optional[int] = None
stop_reason: Optional[str] = None
tool_inputs_by_id: Optional[dict] = None


@dataclass
class ToolCall:
"""Represents a tool call requested by the LLM.
@@ -332,6 +379,59 @@ async def async_complete(
lambda: self.complete(messages, purpose, tools, max_tokens, temperature, system),
)

def supports(self, capability: str) -> bool:
"""Check whether this provider supports an optional capability.

Args:
capability: Capability name, e.g. ``"extended_thinking"``.

Returns:
``True`` if the capability is supported, ``False`` otherwise.
"""
return False

# Not decorated with @abstractmethod intentionally: providers that only
# support synchronous completion (e.g. thin wrappers) don't need to
# implement streaming. Calling async_stream() on such a provider raises
# NotImplementedError at call time rather than at instantiation.
async def async_stream(
self,
messages: list[dict],
system: str,
tools: list[dict],
model: str,
max_tokens: int,
interrupt_event: Optional[asyncio.Event] = None,
extended_thinking: bool = False,
) -> AsyncIterator["StreamChunk"]:
"""Stream a completion as normalized :class:`StreamChunk` objects.

Subclasses should override this with a provider-specific implementation.
The default raises :exc:`NotImplementedError`.

Args:
messages: Conversation messages in the provider's expected format.
system: System prompt string.
tools: Already-serialized tool definitions (list of dicts).
model: Model identifier to use for this call.
max_tokens: Maximum output tokens.
interrupt_event: When set, the stream should stop at the next
opportunity.
extended_thinking: When ``True``, request extended thinking tokens
from providers that support them (see :meth:`supports`).
Providers that do not support this capability should silently
ignore the flag.

Yields:
:class:`StreamChunk` objects in order of generation.
"""
raise NotImplementedError(
f"{type(self).__name__} does not implement async_stream(). "
"Override this method in your provider subclass."
)
if False: # pragma: no cover # makes this an async generator
yield # type: ignore[misc]

def get_model(self, purpose: Purpose) -> str:
"""Get the model for a given purpose.

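Given the `StreamChunk` contract documented above (text via deltas, authoritative tool inputs only in `message_stop`, provider-dependent `tool_use_stop` ordering), a consumer loop can be sketched like this. The chunks here use `SimpleNamespace` as a stand-in for the real dataclass, and `fake_stream` is an illustrative provider, not one of the project's adapters.

```python
import asyncio
from types import SimpleNamespace as Chunk  # stand-in for StreamChunk


async def fake_stream():
    # Anthropic-style ordering: text deltas, then a final message_stop
    # that carries the authoritative tool inputs.
    yield Chunk(type="text_delta", text="hello ")
    yield Chunk(type="text_delta", text="world")
    yield Chunk(
        type="message_stop",
        stop_reason="end_turn",
        tool_inputs_by_id={"t1": {"path": "a.py"}},
    )


async def consume(stream, interrupt_event=None):
    text_parts: list[str] = []
    tool_inputs: dict = {}
    async for chunk in stream:
        if interrupt_event and interrupt_event.is_set():
            break  # honor interruption at the next chunk boundary
        if chunk.type == "text_delta":
            text_parts.append(chunk.text)
        elif chunk.type == "message_stop":
            # Use tool_inputs_by_id here, never tool_use_stop ordering,
            # since that ordering differs between providers.
            tool_inputs = chunk.tool_inputs_by_id or {}
    return "".join(text_parts), tool_inputs


text, tools = asyncio.run(consume(fake_stream()))
```

Because the consumer keys off `message_stop`, it works unchanged whether `tool_use_stop` chunks arrive interleaved (Anthropic) or batched at the end (OpenAI-compatible).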
77 changes: 76 additions & 1 deletion codeframe/adapters/llm/mock.py
@@ -4,13 +4,15 @@
Supports configurable responses and call tracking.
"""

from typing import Callable, Iterator, Optional
import asyncio
from typing import AsyncIterator, Callable, Iterator, Optional

from codeframe.adapters.llm.base import (
LLMProvider,
LLMResponse,
ModelSelector,
Purpose,
StreamChunk,
Tool,
ToolCall,
)
@@ -40,6 +42,8 @@ def __init__(
self.responses: list[LLMResponse] = []
self.response_index = 0
self.response_handler: Optional[Callable[[list[dict]], LLMResponse]] = None
self.stream_chunks: list[list[StreamChunk]] = []
self.stream_index = 0

def add_response(self, response: LLMResponse) -> None:
"""Add a canned response to the queue.
@@ -175,12 +179,83 @@ def stream(
for word in response.content.split():
yield word + " "

def add_stream_chunks(self, chunks: list[StreamChunk]) -> None:
"""Add a sequence of StreamChunks for the next async_stream() call.

Args:
chunks: Ordered list of StreamChunk objects to yield.
"""
self.stream_chunks.append(chunks)

async def async_stream(
self,
messages: list[dict],
system: str,
tools: list[dict],
model: str,
max_tokens: int,
interrupt_event: Optional[asyncio.Event] = None,
extended_thinking: bool = False,
) -> AsyncIterator[StreamChunk]:
"""Yield pre-configured StreamChunk sequences for testing.

Tracks each call in :attr:`calls` (same metadata as :meth:`complete`).
When pre-configured ``stream_chunks`` are available, yields them in
order. Otherwise falls back to a minimal ``text_delta`` +
``message_stop`` pair derived from the normal response queue
(``responses`` / ``response_handler`` / ``default_response``).
"""
# Track the call so tests can assert on it
self.calls.append(
{
"messages": messages,
"system": system,
"tools": tools,
"model": model,
"max_tokens": max_tokens,
"extended_thinking": extended_thinking,
}
)

if self.stream_index < len(self.stream_chunks):
chunks = self.stream_chunks[self.stream_index]
self.stream_index += 1
else:
# Derive response text from the normal queue / handler
if self.response_handler:
resp = self.response_handler(messages)
text = resp.content
elif self.response_index < len(self.responses):
resp = self.responses[self.response_index]
self.response_index += 1
text = resp.content
else:
text = self.default_response

chunks = [
StreamChunk(type="text_delta", text=text),
StreamChunk(
type="message_stop",
stop_reason="end_turn",
input_tokens=len(str(messages)),
output_tokens=len(text),
tool_inputs_by_id={},
),
]

for chunk in chunks:
if interrupt_event and interrupt_event.is_set():
return
yield chunk
Comment on lines +190 to +249
⚠️ Potential issue | 🟡 Minor

Keep async_stream() aligned with the rest of MockProvider.

When no explicit stream_chunks are queued, this path ignores response_handler and queued responses, and it never records the call. Tests that switch from complete()/stream() to async_stream() will silently get different behavior, and call_count / last_call stop reflecting streaming usage.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `codeframe/adapters/llm/mock.py` around lines 190-223: the default branch of async_stream ignores the provider's response queue and call logging; modify MockProvider.async_stream so that when no preconfigured stream_chunks are present it (1) records the call with the same metadata used by stream()/complete() so call_count and last_call stay accurate, (2) consumes from self.responses (or uses self.default_response) and invokes self.response_handler to generate the StreamChunk sequence instead of hardcoding chunks, and (3) yields those chunks while still honoring interrupt_event; reference async_stream, response_handler, self.responses, self.default_response, self.call_count and self.last_call to locate and update the logic.
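The alignment this comment describes — the streaming fallback consulting the same handler/queue/default chain as `complete()` while still recording the call — can be sketched with a toy double. All names below (`TinyMock`, `first_text`) are illustrative, not the project's actual `MockProvider`.

```python
import asyncio


class TinyMock:
    """Toy test double whose streaming path shares the response queue."""

    def __init__(self, default: str = "ok"):
        self.responses: list[str] = []
        self.response_index = 0
        self.response_handler = None
        self.default_response = default
        self.calls: list[dict] = []

    async def async_stream(self, messages):
        self.calls.append({"messages": messages})  # track like complete()
        if self.response_handler:
            text = self.response_handler(messages)
        elif self.response_index < len(self.responses):
            text = self.responses[self.response_index]
            self.response_index += 1
        else:
            text = self.default_response
        yield {"type": "text_delta", "text": text}
        yield {"type": "message_stop"}


async def first_text(provider, messages):
    async for chunk in provider.async_stream(messages):
        if chunk["type"] == "text_delta":
            return chunk["text"]


mock = TinyMock()
mock.responses.append("queued")
text = asyncio.run(first_text(mock, [{"role": "user", "content": "hi"}]))
```

Because the stream draws from the same queue, a test that switches from `complete()` to `async_stream()` sees the same canned responses, and call tracking keeps working.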


def reset(self) -> None:
"""Reset call tracking and response queue."""
self.calls.clear()
self.responses.clear()
self.response_index = 0
self.response_handler = None
self.stream_chunks.clear()
self.stream_index = 0

@property
def call_count(self) -> int: