From 4701ec6b5f419a292450d461b7c03d3f2e060378 Mon Sep 17 00:00:00 2001
From: Alex Bozarth <ajbozart@us.ibm.com>
Date: Fri, 13 Mar 2026 15:55:45 -0500
Subject: [PATCH 1/4] feat: add usage field to ModelOutputThunk and populate in
 all backends

- Add usage field to ModelOutputThunk with OpenAI-compatible format
- Update all 5 backends to populate usage field during post_processing

Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
---
 mellea/backends/huggingface.py |  8 ++++++++
 mellea/backends/litellm.py     |  4 ++++
 mellea/backends/ollama.py      |  8 ++++++++
 mellea/backends/openai.py      |  4 ++++
 mellea/backends/watsonx.py     |  4 ++++
 mellea/core/base.py            | 10 ++++++++++
 6 files changed, 38 insertions(+)

diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
index c7886e7e..83655f3e 100644
--- a/mellea/backends/huggingface.py
+++ b/mellea/backends/huggingface.py
@@ -1075,6 +1075,14 @@ async def post_processing(
             except Exception:
                 pass
 
+        # Populate standardized usage field (convert to OpenAI format)
+        if n_prompt is not None and n_completion is not None:
+            mot.usage = {
+                "prompt_tokens": n_prompt,
+                "completion_tokens": n_completion,
+                "total_tokens": n_prompt + n_completion,
+            }
+
         # Record metrics if enabled
         if metrics_enabled and n_prompt is not None:
             from ..telemetry.backend_instrumentation import (
diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py
index 4f5bac38..ef453d29 100644
--- a/mellea/backends/litellm.py
+++ b/mellea/backends/litellm.py
@@ -488,6 +488,10 @@ async def post_processing(
         if usage is None:
             usage = mot._meta.get("litellm_streaming_usage")
 
+        # Populate standardized usage field (LiteLLM uses OpenAI format)
+        if usage:
+            mot.usage = usage
+
         # Record metrics if enabled
         from ..telemetry.metrics import is_metrics_enabled
 
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index afe80ec8..276bb180 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -622,6 +622,14 @@ async def post_processing(
         )
         completion_tokens = getattr(response, "eval_count", None) if response else None
 
+        # Populate standardized usage field (convert to OpenAI format)
+        if prompt_tokens is not None or completion_tokens is not None:
+            mot.usage = {
+                "prompt_tokens": prompt_tokens or 0,
+                "completion_tokens": completion_tokens or 0,
+                "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0),
+            }
+
         # Record metrics if enabled
         from ..telemetry.metrics import is_metrics_enabled
 
diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index faadfc45..16a93485 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -625,6 +625,10 @@ async def post_processing(
         if usage is None:
             usage = mot._meta.get("oai_streaming_usage")
 
+        # Populate standardized usage field (OpenAI format already matches)
+        if usage:
+            mot.usage = usage
+
         # Record metrics if enabled
         from ..telemetry.metrics import is_metrics_enabled
 
diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py
index d6ca943e..5a4bf24a 100644
--- a/mellea/backends/watsonx.py
+++ b/mellea/backends/watsonx.py
@@ -496,6 +496,10 @@ async def post_processing(
                 else getattr(response, "usage", None)
             )
 
+        # Populate standardized usage field (WatsonX uses OpenAI format)
+        if usage:
+            mot.usage = usage
+
         # Record metrics if enabled
         from ..telemetry.metrics import is_metrics_enabled
 
diff --git a/mellea/core/base.py b/mellea/core/base.py
index 60d65029..28b58234 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -196,6 +196,15 @@ def __init__(
         # Additional fields that should be standardized across apis.
         self.tool_calls = tool_calls
         self._thinking: str | None = None
+        self.usage: dict[str, int] | None = None
+        """Usage information following OpenAI API standard.
+
+        Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'.
+        Populated by backends during post_processing. None if unavailable.
+
+        Future: May include optional breakdown fields like 'completion_tokens_details'
+        and 'prompt_tokens_details' for advanced features (reasoning, audio, caching).
+        """
 
         # Used for tracking generation.
         self._context: list[Component | CBlock] | None = None
@@ -233,6 +242,7 @@ def _copy_from(self, other: ModelOutputThunk) -> None:
         self.parsed_repr = other.parsed_repr
         self.tool_calls = other.tool_calls
         self._thinking = other._thinking
+        self.usage = other.usage
         self._generate_log = other._generate_log
 
     def is_computed(self) -> bool:

From b4649c9ede1d767eefaf123567004b5304bb5492 Mon Sep 17 00:00:00 2001
From: Alex Bozarth <ajbozart@us.ibm.com>
Date: Fri, 13 Mar 2026 19:04:51 -0500
Subject: [PATCH 2/4] refactor: migrate token metrics to plugin-based system

Completes migration from direct backend metrics calls to hook-based plugin system.
TokenMetricsPlugin now handles all token metrics recording via generation_post_call hook.

- Remove record_token_usage_metrics() calls from OpenAI, Ollama, LiteLLM, WatsonX, HuggingFace backends
- Update metrics output format for OTel compliance

Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
---
 docs/dev/metrics_refactor_plan.md      | 673 +++++++++++++++++++++++++
 mellea/backends/huggingface.py         |  21 +-
 mellea/backends/litellm.py             |  21 +-
 mellea/backends/ollama.py              |  20 +-
 mellea/backends/openai.py              |  21 +-
 mellea/backends/watsonx.py             |  21 +-
 mellea/core/base.py                    |  22 +
 mellea/telemetry/metrics.py            |  46 +-
 mellea/telemetry/metrics_plugins.py    |  62 +++
 pyproject.toml                         |   1 +
 test/telemetry/test_metrics.py         |  30 +-
 test/telemetry/test_metrics_backend.py |  22 +-
 test/telemetry/test_metrics_token.py   |  54 +-
 uv.lock                                |  13 +-
 14 files changed, 838 insertions(+), 189 deletions(-)
 create mode 100644 docs/dev/metrics_refactor_plan.md
 create mode 100644 mellea/telemetry/metrics_plugins.py

diff --git a/docs/dev/metrics_refactor_plan.md b/docs/dev/metrics_refactor_plan.md
new file mode 100644
index 00000000..4a82ce7b
--- /dev/null
+++ b/docs/dev/metrics_refactor_plan.md
@@ -0,0 +1,673 @@
+# Metrics Refactor Plan: Migrate to Hooks/Plugins System
+
+## Overview
+
+This plan addresses issues [#607](https://github.com/generative-computing/mellea/issues/607) and [#608](https://github.com/generative-computing/mellea/issues/608) to refactor the token metrics implementation added in commit `0e71558` to use the hooks/plugins system introduced in commit `cbd63bd`.
+
+**Current State**: Token metrics are recorded via direct `record_token_usage_metrics()` calls in each backend's `post_processing()` method (5 backends: OpenAI, Ollama, LiteLLM, HuggingFace, WatsonX).
+
+**Target State**: Token metrics recorded via a plugin that hooks into `generation_post_call`, with token usage data standardized on `ModelOutputThunk` using a `usage` field that matches the OpenAI API standard.
+
+## Key Commits Analysis
+
+### Commit 0e71558 (Metrics Implementation)
+- Added `mellea/telemetry/metrics.py` with `record_token_usage_metrics()`
+- Modified 5 backends to call `record_token_usage_metrics()` in their `post_processing()` methods
+- Token extraction logic varies per backend:
+  - **OpenAI/LiteLLM/WatsonX**: Extract from `usage` dict via `get_value(usage, "prompt_tokens")`
+  - **Ollama**: Extract from response attributes (`prompt_eval_count`, `eval_count`)
+  - **HuggingFace**: Calculate from `input_ids` and output sequences
+- Each backend stores usage info in different `_meta` locations
+
+### Commit cbd63bd (Hooks/Plugins System)
+- Added complete hooks/plugins infrastructure in `mellea/plugins/`
+- Implemented `generation_post_call` hook that fires after `post_process()` completes
+- Hook fires in `ModelOutputThunk.astream()` at line 384-398 in `mellea/core/base.py`
+- Payload: `GenerationPostCallPayload(prompt, model_output, latency_ms)`
+- Policy: `generation_post_call` is **observe-only** (no writable fields)
+
+## Problem Statement
+
+### Issue #607: Standardize Token Storage
+**Problem**: Token usage is stored inconsistently across backends in various `_meta` locations, making programmatic access difficult.
+
+**Solution**: Add a standard `usage` field to `ModelOutputThunk` that all backends populate, matching the OpenAI API standard.
+
+### Issue #608: Use Hooks for Metrics
+**Problem**: Metrics recording is duplicated across 5 backends, tightly coupled to backend implementation.
+
+**Solution**: Create a metrics plugin that hooks `generation_post_call` to record token usage from the standardized field.
+
+## Architecture Decision
+
+The refactor follows this sequence:
+
+1. **First** (Issue #607): Standardize token storage on `ModelOutputThunk`
+2. **Then** (Issue #608): Create metrics plugin to consume standardized data
+
+This ordering is critical because:
+- The plugin needs a consistent data source to read from
+- Backends must populate the standard field before the hook fires
+- The hook fires **after** `post_processing()` completes, so backends have already extracted tokens
+
+## Detailed Implementation Plan
+
+### Phase 1: Standardize Token Storage (Issue #607)
+
+#### 1.1 Add `usage` Field to ModelOutputThunk
+
+**File**: `mellea/core/base.py`
+
+**Changes**:
+```python
+class ModelOutputThunk(CBlock, Generic[S]):
+    def __init__(
+        self,
+        value: str | None,
+        meta: dict[str, Any] | None = None,
+        parsed_repr: S | None = None,
+        tool_calls: dict[str, ModelToolCall] | None = None,
+    ):
+        # ... existing code ...
+        
+        # Add new field for standardized usage information
+        self.usage: dict[str, int] | None = None
+        """Usage information following OpenAI API standard.
+        
+        Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'.
+        Populated by backends during post_processing. None if unavailable.
+        
+        Future: May include optional breakdown fields like 'completion_tokens_details'
+        and 'prompt_tokens_details' for advanced features (reasoning, audio, caching).
+        """
+```
+
+**Rationale**:
+- Matches OpenAI API standard (industry convention)
+- Consistent with existing `tool_calls` field pattern
+- Extensible for future usage fields beyond tokens (cost, reasoning tokens, cached tokens)
+- Simple dict format compatible with all backends
+- Optional (None) when backend doesn't provide usage info
+- Accessible via `model_output.usage` in hooks and user code
+
+#### 1.2 Update Backend `post_processing()` Methods
+
+Each backend's `post_processing()` method must populate `mot.token_usage` **before** the method returns (since `generation_post_call` hook fires after `post_processing` completes).
+
+**Pattern for all backends**:
+```python
+async def post_processing(self, mot: ModelOutputThunk, ...):
+    # ... existing processing logic ...
+    
+    # Extract token usage (backend-specific logic)
+    prompt_tokens = <extract_prompt_tokens>
+    completion_tokens = <extract_completion_tokens>
+    
+    # Populate standardized field (matches OpenAI API format)
+    if prompt_tokens is not None or completion_tokens is not None:
+        mot.usage = {
+            "prompt_tokens": prompt_tokens or 0,
+            "completion_tokens": completion_tokens or 0,
+            "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0),
+        }
+    
+    # REMOVE: Direct metrics recording
+    # from ..telemetry.metrics import record_token_usage_metrics
+    # record_token_usage_metrics(...)
+```
+
+**Backend-Specific Extraction Logic**:
+
+1. **OpenAI** (`mellea/backends/openai.py:562-646`):
+   ```python
+   # Extract from response or streaming usage (lines 620-626)
+   response = mot._meta["oai_chat_response"]
+   usage = response.get("usage") if isinstance(response, dict) else None
+   if usage is None:
+       usage = mot._meta.get("oai_streaming_usage")
+   
+   # Populate standardized field (already matches OpenAI format)
+   if usage:
+       mot.usage = {
+           "prompt_tokens": get_value(usage, "prompt_tokens") or 0,
+           "completion_tokens": get_value(usage, "completion_tokens") or 0,
+           "total_tokens": get_value(usage, "total_tokens") or 0,
+       }
+   ```
+
+2. **Ollama** (`mellea/backends/ollama.py:583-642`):
+   ```python
+   # Extract from response attributes (lines 620-623)
+   response = mot._meta.get("ollama_response")
+   prompt_tokens = getattr(response, "prompt_eval_count", None) if response else None
+   completion_tokens = getattr(response, "eval_count", None) if response else None
+   
+   # Convert to OpenAI-compatible format
+   if prompt_tokens is not None or completion_tokens is not None:
+       mot.usage = {
+           "prompt_tokens": prompt_tokens or 0,
+           "completion_tokens": completion_tokens or 0,
+           "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0),
+       }
+   ```
+
+3. **LiteLLM** (`mellea/backends/litellm.py:425-508`):
+   ```python
+   # Extract from full response or streaming usage (lines 483-489)
+   full_response = mot._meta.get("litellm_full_response")
+   usage = full_response.get("usage") if isinstance(full_response, dict) else None
+   if usage is None:
+       usage = mot._meta.get("litellm_streaming_usage")
+   
+   # Populate standardized field (already matches OpenAI format)
+   if usage:
+       mot.usage = {
+           "prompt_tokens": get_value(usage, "prompt_tokens") or 0,
+           "completion_tokens": get_value(usage, "completion_tokens") or 0,
+           "total_tokens": get_value(usage, "total_tokens") or 0,
+       }
+   ```
+
+4. **HuggingFace** (`mellea/backends/huggingface.py:1001-1092`):
+   ```python
+   # Calculate from sequences (lines 1063-1076)
+   hf_output = mot._meta.get("hf_output")
+   if isinstance(hf_output, GenerateDecoderOnlyOutput):
+       if input_ids is not None and hf_output.sequences is not None:
+           try:
+               n_prompt = input_ids.shape[1]
+               n_completion = hf_output.sequences[0].shape[0] - n_prompt
+               # Convert to OpenAI-compatible format
+               mot.usage = {
+                   "prompt_tokens": n_prompt,
+                   "completion_tokens": n_completion,
+                   "total_tokens": n_prompt + n_completion,
+               }
+           except Exception:
+               pass  # Leave as None if calculation fails
+   ```
+
+5. **WatsonX** (`mellea/backends/watsonx.py:450-508`):
+   ```python
+   # Extract from response usage (similar to OpenAI)
+   response = mot._meta.get("watsonx_response")
+   usage = response.get("usage") if isinstance(response, dict) else None
+   
+   # Populate standardized field (already matches OpenAI format)
+   if usage:
+       mot.usage = {
+           "prompt_tokens": get_value(usage, "prompt_tokens") or 0,
+           "completion_tokens": get_value(usage, "completion_tokens") or 0,
+           "total_tokens": get_value(usage, "total_tokens") or 0,
+       }
+   ```
+
+**Files to Modify**:
+- `mellea/core/base.py` (add field)
+- `mellea/backends/openai.py` (populate field, remove direct metrics call)
+- `mellea/backends/ollama.py` (populate field, remove direct metrics call)
+- `mellea/backends/litellm.py` (populate field, remove direct metrics call)
+- `mellea/backends/huggingface.py` (populate field, remove direct metrics call)
+- `mellea/backends/watsonx.py` (populate field, remove direct metrics call)
+
+### Phase 2: Create Metrics Plugin (Issue #608)
+
+#### 2.1 Create Token Metrics Plugin
+
+**New File**: `mellea/plugins/builtin/token_metrics.py`
+
+**Implementation**:
+```python
+"""Built-in plugin for recording token usage metrics via OpenTelemetry."""
+
+from mellea.plugins import Plugin, hook, HookType, PluginMode
+from mellea.plugins.hooks.generation import GenerationPostCallPayload
+
+
+class TokenMetricsPlugin(Plugin, name="token-metrics", priority=100):
+    """Records token usage metrics from generation_post_call hook.
+    
+    This plugin automatically records input/output token counts to OpenTelemetry
+    metrics when MELLEA_METRICS_ENABLED=true. It reads from the standardized
+    ModelOutputThunk.usage field populated by backends.
+    
+    Execution mode: FIRE_AND_FORGET (async, non-blocking, observe-only)
+    Priority: 100 (runs after other plugins)
+    """
+    
+    @hook(HookType.GENERATION_POST_CALL, mode=PluginMode.FIRE_AND_FORGET)
+    async def record_tokens(
+        self,
+        payload: GenerationPostCallPayload,
+        ctx
+    ):
+        """Record token usage metrics from the model output."""
+        from mellea.telemetry.metrics import is_metrics_enabled
+        
+        # Early return if metrics disabled (zero overhead)
+        if not is_metrics_enabled():
+            return
+        
+        mot = payload.model_output
+        if mot is None or mot.usage is None:
+            return
+        
+        # Extract backend info from context
+        backend = ctx.global_context.state.get("backend")
+        if backend is None:
+            return
+        
+        from mellea.telemetry.backend_instrumentation import (
+            get_model_id_str,
+            get_system_name,
+        )
+        from mellea.telemetry.metrics import record_token_usage_metrics
+        
+        # Record using standardized usage dict
+        record_token_usage_metrics(
+            input_tokens=mot.usage.get("prompt_tokens"),
+            output_tokens=mot.usage.get("completion_tokens"),
+            model=get_model_id_str(backend),
+            backend=backend.__class__.__name__,
+            system=get_system_name(backend),
+        )
+```
+
+**Key Design Decisions**:
+- **FIRE_AND_FORGET mode**: Metrics recording is async, non-blocking, and cannot fail the generation
+- **Priority 100**: Runs after other plugins (lower priority = earlier execution)
+- **Observe-only**: Reads from `model_output.usage`, doesn't modify payload
+- **Zero overhead**: Early return when metrics disabled
+- **Backend-agnostic**: Works with any backend that populates `usage`
+- **OpenAI-compatible**: Uses standard field name for familiarity and future extensibility
+
+#### 2.2 Auto-Register Plugin When Metrics Enabled
+
+**File**: `mellea/telemetry/metrics.py`
+
+**Changes**:
+```python
+# At module initialization (after _meter setup, around line 247)
+if _OTEL_AVAILABLE and _METRICS_ENABLED:
+    _meter_provider = _setup_meter_provider()
+    if _meter_provider is not None:
+        _meter = metrics.get_meter("mellea.metrics", version("mellea"))
+        
+        # Auto-register token metrics plugin
+        from mellea.plugins.builtin.token_metrics import TokenMetricsPlugin
+        from mellea.plugins import register
+        
+        _token_metrics_plugin = TokenMetricsPlugin()
+        register(_token_metrics_plugin)
+```
+
+**Rationale**:
+- Automatic activation when `MELLEA_METRICS_ENABLED=true`
+- No user code changes required
+- Consistent with existing metrics module behavior
+- Plugin is globally registered (works with both session and functional API)
+
+#### 2.3 Update Plugin Module Structure
+
+**New Directory**: `mellea/plugins/builtin/`
+
+**Files**:
+- `mellea/plugins/builtin/__init__.py` - Export built-in plugins
+- `mellea/plugins/builtin/token_metrics.py` - Token metrics plugin
+
+**Update**: `mellea/plugins/__init__.py`
+```python
+# Add to exports
+from mellea.plugins.builtin import TokenMetricsPlugin
+
+__all__ = [
+    # ... existing exports ...
+    "TokenMetricsPlugin",
+]
+```
+
+### Phase 3: Update Tests
+
+#### 3.1 Update Backend Integration Tests
+
+**Files**: `test/telemetry/test_metrics_backend.py`
+
+**Changes**:
+- Tests should verify `mot.token_usage` is populated (not just metrics recorded)
+- Add assertions: `assert mot.token_usage is not None`
+- Add assertions: `assert mot.token_usage["prompt_tokens"] > 0`
+- Keep existing metrics verification (plugin should still record them)
+
+**Example**:
+```python
+async def test_openai_token_metrics(ollama_backend):
+    """Test that OpenAI backend populates usage and metrics are recorded."""
+    # ... existing test setup ...
+    
+    # Verify usage field is populated
+    assert mot.usage is not None
+    assert mot.usage["prompt_tokens"] > 0
+    assert mot.usage["completion_tokens"] > 0
+    assert mot.usage["total_tokens"] > 0
+    
+    # Verify metrics were recorded (via plugin)
+    # ... existing metrics verification ...
+```
+
+#### 3.2 Add Plugin-Specific Tests
+
+**New File**: `test/plugins/test_token_metrics_plugin.py`
+
+**Tests**:
+1. Test plugin registers correctly when metrics enabled
+2. Test plugin reads from `mot.usage` correctly
+3. Test plugin handles missing `usage` gracefully
+4. Test plugin is no-op when metrics disabled
+5. Test plugin works with all backends
+6. Test plugin doesn't block on errors (FIRE_AND_FORGET mode)
+
+#### 3.3 Update Unit Tests
+
+**File**: `test/telemetry/test_metrics_token.py`
+
+**Changes**:
+- Tests for `record_token_usage_metrics()` remain unchanged (function still exists)
+- Add note that function is now called by plugin, not backends directly
+
+### Phase 4: Update Documentation
+
+#### 4.1 Update Telemetry Documentation
+
+**File**: `docs/dev/telemetry.md`
+
+**Changes**:
+- Update "Token Usage Metrics" section to explain plugin-based architecture
+- Add note about `ModelOutputThunk.usage` field (OpenAI-compatible)
+- Update backend support table to note standardized field
+- Add example of accessing token usage programmatically
+
+**Example Addition**:
+```markdown
+### Programmatic Access to Token Usage
+
+Token usage information is available on the `ModelOutputThunk` after generation:
+
+```python
+with start_session() as m:
+    result = m.instruct("What is the capital of France?")
+    
+    # Access token usage (OpenAI-compatible format)
+    if result.usage:
+        print(f"Prompt tokens: {result.usage['prompt_tokens']}")
+        print(f"Completion tokens: {result.usage['completion_tokens']}")
+        print(f"Total tokens: {result.usage['total_tokens']}")
+```
+
+Token metrics are automatically recorded to OpenTelemetry when 
+`MELLEA_METRICS_ENABLED=true` via the built-in `TokenMetricsPlugin`.
+```
+
+#### 4.2 Update Plugin Documentation
+
+**File**: `docs/docs/core-concept/plugins.mdx`
+
+**Changes**:
+- Add section on built-in plugins
+- Document `TokenMetricsPlugin` as an example of FIRE_AND_FORGET mode
+- Show how built-in plugins auto-register
+
+#### 4.3 Update Examples
+
+**File**: `docs/examples/telemetry/metrics_example.py`
+
+**Changes**:
+- Add example showing programmatic access to `usage`
+- Add comment explaining plugin-based architecture
+- No functional changes (metrics still work the same way)
+
+**Example Addition**:
+```python
+# Example 5: Programmatic token access
+print("\n5. Accessing token usage programmatically...")
+result = m.instruct("Count to five")
+if result.usage:
+    print(f"  Prompt tokens: {result.usage['prompt_tokens']}")
+    print(f"  Completion tokens: {result.usage['completion_tokens']}")
+    print(f"  Total tokens: {result.usage['total_tokens']}")
+```
+
+### Phase 5: Cleanup
+
+#### 5.1 Remove Backend-Specific Metrics Code
+
+**All Backend Files**:
+- Remove `from ..telemetry.metrics import is_metrics_enabled` imports
+- Remove `from ..telemetry.metrics import record_token_usage_metrics` imports
+- Remove `from ..telemetry.backend_instrumentation import get_model_id_str, get_system_name` imports (if only used for metrics)
+- Remove `if is_metrics_enabled():` blocks and `record_token_usage_metrics()` calls
+
+**Estimated Lines Removed**: ~15-20 lines per backend × 5 backends = ~75-100 lines
+
+#### 5.2 Keep Core Metrics Infrastructure
+
+**Files to Keep Unchanged**:
+- `mellea/telemetry/metrics.py` - Core metrics functions still needed by plugin
+- `mellea/telemetry/backend_instrumentation.py` - Helper functions still needed
+
+## Implementation Order
+
+### Step 1: Add `token_usage` Field (Issue #607)
+1. Modify `mellea/core/base.py` to add `token_usage` field
+2. Update all 5 backends to populate the field
+3. Keep existing `record_token_usage_metrics()` calls temporarily (dual-write)
+4. Run tests to verify field is populated correctly
+
+### Step 2: Create Metrics Plugin (Issue #608)
+5. Create `mellea/plugins/builtin/` directory
+6. Implement `TokenMetricsPlugin` in `token_metrics.py`
+7. Auto-register plugin in `mellea/telemetry/metrics.py`
+8. Run tests to verify plugin records metrics correctly
+
+### Step 3: Remove Duplicate Code
+9. Remove direct `record_token_usage_metrics()` calls from all backends
+10. Remove unused imports from backends
+11. Run full test suite to verify no regressions
+
+### Step 4: Update Documentation and Tests
+12. Update backend integration tests to verify `token_usage` field
+13. Add plugin-specific tests
+14. Update documentation (telemetry.md, plugins.mdx)
+15. Update examples to show programmatic access
+
+## Testing Strategy
+
+### Test Coverage Required
+
+1. **Unit Tests** (existing, should pass unchanged):
+   - `test/telemetry/test_metrics_token.py` - Tests for `record_token_usage_metrics()`
+
+2. **Backend Integration Tests** (need updates):
+   - `test/telemetry/test_metrics_backend.py` - Add `token_usage` field assertions
+   - All 5 backend tests should verify field population
+
+3. **Plugin Tests** (new):
+   - `test/plugins/test_token_metrics_plugin.py` - Plugin-specific tests
+   - Test plugin registration, execution, error handling
+
+4. **End-to-End Tests** (existing, should pass):
+   - `docs/examples/telemetry/metrics_example.py` - Should work unchanged
+
+### Test Execution
+
+```bash
+# Run metrics tests
+uv run pytest test/telemetry/test_metrics*.py -v
+
+# Run plugin tests
+uv run pytest test/plugins/test_token_metrics_plugin.py -v
+
+# Run backend tests
+uv run pytest test/backends/ -k "metrics" -v
+
+# Run example as test
+uv run pytest docs/examples/telemetry/metrics_example.py -v
+```
+
+## Migration Impact
+
+### Breaking Changes
+**None** - This is a pure refactor with no API changes:
+- Metrics still recorded automatically when `MELLEA_METRICS_ENABLED=true`
+- Same metric names, attributes, and exporters
+- User code unchanged
+
+### New Features
+- **Programmatic access**: Users can now access `mot.token_usage` directly
+- **Extensibility**: Users can create custom metrics plugins following this pattern
+- **Consistency**: Token data in standard location across all backends
+
+### Performance Impact
+- **Negligible**: Plugin uses FIRE_AND_FORGET mode (async, non-blocking)
+- **Same overhead**: Metrics recording logic unchanged, just moved to plugin
+- **Zero overhead when disabled**: Early return in plugin when metrics disabled
+
+## Risks and Mitigations
+
+### Risk 1: Hook Timing
+**Risk**: `generation_post_call` fires after `post_processing()`, but what if backends don't populate `token_usage` in time?
+
+**Mitigation**: Backends populate `token_usage` **during** `post_processing()`, which completes **before** the hook fires. The hook call is at line 384-398 in `ModelOutputThunk.astream()`, after line 366 (`await self._post_process(self)`).
+
+### Risk 2: Missing Token Data
+**Risk**: Some backends might not have token usage available.
+
+**Mitigation**: 
+- `token_usage` field is optional (None when unavailable)
+- Plugin checks `if mot.token_usage is None: return`
+- Existing behavior preserved (metrics not recorded when data unavailable)
+
+### Risk 3: Plugin Framework Dependency
+**Risk**: Plugin requires `mellea[hooks]` extra dependency.
+
+**Mitigation**:
+- Plugin only imported when metrics enabled
+- Graceful fallback if hooks not installed (warning message)
+- Most users enabling metrics will have full installation
+
+### Risk 4: Test Failures
+**Risk**: Existing tests might fail if they expect metrics without the plugin.
+
+**Mitigation**:
+- Plugin auto-registers when metrics enabled (same as before)
+- Tests run with `MELLEA_METRICS_ENABLED=true` will get plugin automatically
+- Dual-write during Step 1 ensures no test breakage during transition
+
+## Success Criteria
+
+### Phase 1 Complete When:
+- [ ] `ModelOutputThunk.token_usage` field added
+- [ ] All 5 backends populate the field correctly
+- [ ] Backend integration tests verify field population
+- [ ] Existing metrics tests still pass (dual-write active)
+
+### Phase 2 Complete When:
+- [ ] `TokenMetricsPlugin` implemented and tested
+- [ ] Plugin auto-registers when metrics enabled
+- [ ] Plugin records metrics correctly (verified by existing tests)
+- [ ] Plugin-specific tests added and passing
+
+### Phase 3 Complete When:
+- [ ] Direct `record_token_usage_metrics()` calls removed from backends
+- [ ] Unused imports cleaned up
+- [ ] All tests pass (metrics now recorded via plugin only)
+
+### Phase 4 Complete When:
+- [ ] Documentation updated (telemetry.md, plugins.mdx)
+- [ ] Examples updated to show programmatic access
+- [ ] All documentation builds without errors
+
+### Overall Success:
+- [ ] All tests pass (`uv run pytest test/`)
+- [ ] No breaking changes to user API
+- [ ] Metrics still recorded correctly
+- [ ] Code is cleaner (no duplication across backends)
+- [ ] Token usage accessible programmatically via `mot.token_usage`
+
+## File Checklist
+
+### Files to Create
+- [ ] `mellea/plugins/builtin/__init__.py`
+- [ ] `mellea/plugins/builtin/token_metrics.py`
+- [ ] `test/plugins/test_token_metrics_plugin.py`
+
+### Files to Modify
+- [ ] `mellea/core/base.py` - Add `token_usage` field
+- [ ] `mellea/backends/openai.py` - Populate field, remove metrics call
+- [ ] `mellea/backends/ollama.py` - Populate field, remove metrics call
+- [ ] `mellea/backends/litellm.py` - Populate field, remove metrics call
+- [ ] `mellea/backends/huggingface.py` - Populate field, remove metrics call
+- [ ] `mellea/backends/watsonx.py` - Populate field, remove metrics call
+- [ ] `mellea/telemetry/metrics.py` - Auto-register plugin
+- [ ] `mellea/plugins/__init__.py` - Export TokenMetricsPlugin
+- [ ] `test/telemetry/test_metrics_backend.py` - Add token_usage assertions
+- [ ] `docs/dev/telemetry.md` - Document new architecture
+- [ ] `docs/docs/core-concept/plugins.mdx` - Document built-in plugin
+- [ ] `docs/examples/telemetry/metrics_example.py` - Add programmatic access example
+
+### Files to Review (No Changes Expected)
+- `mellea/telemetry/backend_instrumentation.py` - Helper functions still used
+- `test/telemetry/test_metrics_token.py` - Unit tests for core function
+- `mellea/plugins/hooks/generation.py` - Payload definition unchanged
+
+## Estimated Effort
+
+- **Phase 1** (Standardize storage): 2-3 hours
+  - Add field: 15 min
+  - Update 5 backends: 1.5 hours (30 min each)
+  - Update tests: 1 hour
+
+- **Phase 2** (Create plugin): 1-2 hours
+  - Implement plugin: 45 min
+  - Auto-registration: 15 min
+  - Plugin tests: 1 hour
+
+- **Phase 3** (Cleanup): 30 min
+  - Remove duplicate code: 20 min
+  - Verify tests: 10 min
+
+- **Phase 4** (Documentation): 1 hour
+  - Update docs: 30 min
+  - Update examples: 30 min
+
+**Total**: 4.5-6.5 hours
+
+## Open Questions
+
+1. **Should `token_usage` be added to `GenerationPostCallPayload`?**
+   - Pro: Makes it explicit in the hook payload
+   - Con: Redundant (already on `model_output`)
+   - **Recommendation**: No, keep it on MOT only (single source of truth)
+
+2. **Should the plugin be optional or always registered?**
+   - Current plan: Auto-register when `MELLEA_METRICS_ENABLED=true`
+   - Alternative: Always register, let plugin check `is_metrics_enabled()` internally
+   - **Recommendation**: Auto-register (cleaner, no overhead when disabled)
+
+3. **Should we support custom metrics plugins?**
+   - Users could create their own plugins reading from `token_usage`
+   - **Recommendation**: Yes, document this pattern in plugins.mdx
+
+4. **What about vLLM backend?**
+   - Not modified in commit 0e71558 (no metrics support)
+   - **Recommendation**: Out of scope for this refactor, can be added later
+
+## References
+
+- Issue #607: https://github.com/generative-computing/mellea/issues/607
+- Issue #608: https://github.com/generative-computing/mellea/issues/608
+- PR #563 (metrics): https://github.com/generative-computing/mellea/pull/563
+- PR #582 (hooks): https://github.com/generative-computing/mellea/pull/582
+- Commit 0e71558: Metrics implementation
+- Commit cbd63bd: Hooks/plugins system
+- Hook system spec: `docs/dev/hook_system.md`
+- Plugin examples: `docs/examples/plugins/`
\ No newline at end of file
diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
index 83655f3e..4efa5cbb 100644
--- a/mellea/backends/huggingface.py
+++ b/mellea/backends/huggingface.py
@@ -1083,21 +1083,12 @@ async def post_processing(
                 "total_tokens": n_prompt + n_completion,
             }
 
-        # Record metrics if enabled
-        if metrics_enabled and n_prompt is not None:
-            from ..telemetry.backend_instrumentation import (
-                get_model_id_str,
-                get_system_name,
-            )
-            from ..telemetry.metrics import record_token_usage_metrics
-
-            record_token_usage_metrics(
-                input_tokens=n_prompt,
-                output_tokens=n_completion,
-                model=get_model_id_str(self),
-                backend=self.__class__.__name__,
-                system=get_system_name(self),
-            )
+        # Populate model and provider metadata
+        if hasattr(self.model_id, "hf_model_name"):
+            mot.model = str(self.model_id.hf_model_name)  # type: ignore
+        else:
+            mot.model = str(self.model_id)
+        mot.provider = "huggingface"
 
         # Record tracing if span exists
         if span is not None:
diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py
index ef453d29..43e8c262 100644
--- a/mellea/backends/litellm.py
+++ b/mellea/backends/litellm.py
@@ -492,24 +492,9 @@ async def post_processing(
         if usage:
             mot.usage = usage
 
-        # Record metrics if enabled
-        from ..telemetry.metrics import is_metrics_enabled
-
-        if is_metrics_enabled() and usage:
-            from ..telemetry.backend_instrumentation import (
-                get_model_id_str,
-                get_system_name,
-            )
-            from ..telemetry.metrics import record_token_usage_metrics
-            from .utils import get_value
-
-            record_token_usage_metrics(
-                input_tokens=get_value(usage, "prompt_tokens"),
-                output_tokens=get_value(usage, "completion_tokens"),
-                model=get_model_id_str(self),
-                backend=self.__class__.__name__,
-                system=get_system_name(self),
-            )
+        # Populate model and provider metadata
+        mot.model = str(self.model_id)
+        mot.provider = "litellm"
 
         # Record telemetry now that response is available
         span = mot._meta.get("_telemetry_span")
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index 276bb180..8cae59fa 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -630,23 +630,9 @@ async def post_processing(
                 "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0),
             }
 
-        # Record metrics if enabled
-        from ..telemetry.metrics import is_metrics_enabled
-
-        if is_metrics_enabled():
-            from ..telemetry.backend_instrumentation import (
-                get_model_id_str,
-                get_system_name,
-            )
-            from ..telemetry.metrics import record_token_usage_metrics
-
-            record_token_usage_metrics(
-                input_tokens=prompt_tokens,
-                output_tokens=completion_tokens,
-                model=get_model_id_str(self),
-                backend=self.__class__.__name__,
-                system=get_system_name(self),
-            )
+        # Populate model and provider metadata
+        mot.model = str(self.model_id)
+        mot.provider = "ollama"
 
         # Record telemetry and close span now that response is available
         span = mot._meta.get("_telemetry_span")
diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index 16a93485..561fd58b 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -629,24 +629,9 @@ async def post_processing(
         if usage:
             mot.usage = usage
 
-        # Record metrics if enabled
-        from ..telemetry.metrics import is_metrics_enabled
-
-        if is_metrics_enabled() and usage:
-            from ..telemetry.backend_instrumentation import (
-                get_model_id_str,
-                get_system_name,
-            )
-            from ..telemetry.metrics import record_token_usage_metrics
-            from .utils import get_value
-
-            record_token_usage_metrics(
-                input_tokens=get_value(usage, "prompt_tokens"),
-                output_tokens=get_value(usage, "completion_tokens"),
-                model=get_model_id_str(self),
-                backend=self.__class__.__name__,
-                system=get_system_name(self),
-            )
+        # Populate model and provider metadata
+        mot.model = str(self.model_id)
+        mot.provider = "openai"
 
         # Record telemetry now that response is available
         span = mot._meta.get("_telemetry_span")
diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py
index 5a4bf24a..f6f65283 100644
--- a/mellea/backends/watsonx.py
+++ b/mellea/backends/watsonx.py
@@ -500,24 +500,9 @@ async def post_processing(
         if usage:
             mot.usage = usage
 
-        # Record metrics if enabled
-        from ..telemetry.metrics import is_metrics_enabled
-
-        if is_metrics_enabled() and usage:
-            from ..telemetry.backend_instrumentation import (
-                get_model_id_str,
-                get_system_name,
-            )
-            from ..telemetry.metrics import record_token_usage_metrics
-            from .utils import get_value
-
-            record_token_usage_metrics(
-                input_tokens=get_value(usage, "prompt_tokens"),
-                output_tokens=get_value(usage, "completion_tokens"),
-                model=get_model_id_str(self),
-                backend=self.__class__.__name__,
-                system=get_system_name(self),
-            )
+        # Populate model and provider metadata
+        mot.model = str(self.model_id)
+        mot.provider = "watsonx"
 
         # Record tracing if span exists
         span = mot._meta.get("_telemetry_span")
diff --git a/mellea/core/base.py b/mellea/core/base.py
index 28b58234..4850fc9d 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -206,6 +206,20 @@ def __init__(
         and 'prompt_tokens_details' for advanced features (reasoning, audio, caching).
         """
 
+        self.model: str | None = None
+        """Model identifier that generated this output.
+
+        Examples: 'gpt-4', 'llama2:7b', 'meta-llama/Llama-2-7b-hf'.
+        Populated by backends. None if unavailable.
+        """
+
+        self.provider: str | None = None
+        """Provider that generated this output.
+
+        Examples: 'openai', 'ollama', 'huggingface', 'watsonx'.
+        Populated by backends. None if unavailable.
+        """
+
         # Used for tracking generation.
         self._context: list[Component | CBlock] | None = None
         self._action: Component | CBlock | None = None
@@ -243,6 +257,8 @@ def _copy_from(self, other: ModelOutputThunk) -> None:
         self.tool_calls = other.tool_calls
         self._thinking = other._thinking
         self.usage = other.usage
+        self.model = other.model
+        self.provider = other.provider
         self._generate_log = other._generate_log
 
     def is_computed(self) -> bool:
@@ -443,6 +459,9 @@ def __copy__(self) -> ModelOutputThunk:
         copied._context = self._context
         copied._generate_log = self._generate_log
         copied._model_options = self._model_options
+        copied.usage = self.usage
+        copied.model = self.model
+        copied.provider = self.provider
         return copied
 
     def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
@@ -472,6 +491,9 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
         )  # The items in a context should be immutable.
         deepcopied._generate_log = copy(self._generate_log)
         deepcopied._model_options = copy(self._model_options)
+        deepcopied.usage = deepcopy(self.usage) if self.usage else None
+        deepcopied.model = self.model
+        deepcopied.provider = self.provider
         return deepcopied
 
 
diff --git a/mellea/telemetry/metrics.py b/mellea/telemetry/metrics.py
index e4b99294..9382c10b 100644
--- a/mellea/telemetry/metrics.py
+++ b/mellea/telemetry/metrics.py
@@ -407,13 +407,9 @@ def _get_token_counters() -> tuple[Any, Any]:
 
 
 def record_token_usage_metrics(
-    input_tokens: int | None,
-    output_tokens: int | None,
-    model: str,
-    backend: str,
-    system: str,
+    input_tokens: int | None, output_tokens: int | None, model: str, provider: str
 ) -> None:
-    """Record token usage metrics following Gen-AI semantic conventions.
+    """Record token usage metrics following OpenTelemetry Gen-AI semantic conventions.
 
     This is a no-op when metrics are disabled, ensuring zero overhead.
 
@@ -421,16 +417,14 @@ def record_token_usage_metrics(
         input_tokens: Number of input tokens (prompt tokens), or None if unavailable
         output_tokens: Number of output tokens (completion tokens), or None if unavailable
         model: Model identifier (e.g., "gpt-4", "llama2:7b")
-        backend: Backend class name (e.g., "OpenAIBackend", "OllamaBackend")
-        system: Gen-AI system name (e.g., "openai", "ollama", "watsonx")
+        provider: Provider name (e.g., "openai", "ollama", "watsonx")
 
     Example:
         record_token_usage_metrics(
             input_tokens=150,
             output_tokens=50,
             model="llama2:7b",
-            backend="OllamaBackend",
-            system="ollama"
+            provider="ollama"
         )
     """
     # Early return if metrics are disabled (zero overhead)
@@ -440,12 +434,8 @@ def record_token_usage_metrics(
     # Get the token counters (lazily initialized)
     input_counter, output_counter = _get_token_counters()
 
-    # Prepare attributes following Gen-AI semantic conventions
-    attributes = {
-        "gen_ai.system": system,
-        "gen_ai.request.model": model,
-        "mellea.backend": backend,
-    }
+    # Prepare attributes following OTel Gen-AI semantic conventions
+    attributes = {"gen_ai.provider.name": provider, "gen_ai.request.model": model}
 
     # Record input tokens if available
     if input_tokens is not None and input_tokens > 0:
@@ -456,6 +446,30 @@ def record_token_usage_metrics(
         output_counter.add(output_tokens, attributes)
 
 
+# Auto-register TokenMetricsPlugin when metrics are enabled
+if _OTEL_AVAILABLE and _METRICS_ENABLED:
+    try:
+        from mellea.plugins.registry import register
+        from mellea.telemetry.metrics_plugins import TokenMetricsPlugin
+
+        # Idempotent registration (supports module reloads in tests)
+        try:
+            register(TokenMetricsPlugin())
+        except ValueError as e:
+            # Already registered (expected during module reloads in tests)
+            warnings.warn(
+                f"TokenMetricsPlugin already registered: {e}", UserWarning, stacklevel=2
+            )
+    except ImportError:
+        warnings.warn(
+            "Metrics are enabled but the plugin framework is not installed. "
+            "Token usage metrics will not be recorded automatically. "
+            "Install with: pip install mellea[telemetry]",
+            UserWarning,
+            stacklevel=2,
+        )
+
+
 __all__ = [
     "create_counter",
     "create_histogram",
diff --git a/mellea/telemetry/metrics_plugins.py b/mellea/telemetry/metrics_plugins.py
new file mode 100644
index 00000000..cbf45ac8
--- /dev/null
+++ b/mellea/telemetry/metrics_plugins.py
@@ -0,0 +1,62 @@
+"""Metrics plugins for recording telemetry data via hooks.
+
+This module contains plugins that hook into the generation pipeline to
+automatically record metrics when enabled. Currently includes:
+
+- TokenMetricsPlugin: Records token usage statistics from ModelOutputThunk.usage
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from mellea.plugins.base import Plugin
+from mellea.plugins.decorators import hook
+from mellea.plugins.types import PluginMode
+
+if TYPE_CHECKING:
+    from mellea.plugins.hooks.generation import GenerationPostCallPayload
+
+
+class TokenMetricsPlugin(Plugin, name="token_metrics", priority=50):
+    """Records token usage metrics from generation outputs.
+
+    This plugin hooks into the generation_post_call event to automatically
+    record token usage metrics when the usage field is populated on
+    ModelOutputThunk instances.
+
+    The plugin reads the standardized usage field (OpenAI-compatible format)
+    and records metrics following OpenTelemetry Gen-AI semantic conventions.
+
+    Example:
+        >>> from mellea.telemetry.metrics_plugins import TokenMetricsPlugin
+        >>> from mellea.telemetry.metrics import enable_metrics
+        >>>
+        >>> enable_metrics()
+        >>> with TokenMetricsPlugin():
+        ...     result = session.instruct("Hello, world!")
+    """
+
+    @hook("generation_post_call", mode=PluginMode.SEQUENTIAL)
+    async def record_token_metrics(
+        self, payload: GenerationPostCallPayload, context: dict[str, Any]
+    ) -> None:
+        """Record token metrics after generation completes.
+
+        Args:
+            payload: Contains the model_output (ModelOutputThunk) with usage data
+            context: Plugin context (unused)
+        """
+        from mellea.telemetry.metrics import record_token_usage_metrics
+
+        mot = payload.model_output
+        if mot.usage is None:
+            return
+
+        # Record metrics (no-op if metrics disabled)
+        record_token_usage_metrics(
+            input_tokens=mot.usage.get("prompt_tokens"),
+            output_tokens=mot.usage.get("completion_tokens"),
+            model=mot.model or "unknown",
+            provider=mot.provider or "unknown",
+        )
diff --git a/pyproject.toml b/pyproject.toml
index d5d7bb24..fbde0025 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,6 +75,7 @@ telemetry = [
     "opentelemetry-exporter-otlp>=1.20.0",
     "opentelemetry-exporter-prometheus>=0.40b0",
     "opentelemetry-distro>=0.59b0",
+    "mellea[hooks]",
 ]
 
 docling = [
diff --git a/test/telemetry/test_metrics.py b/test/telemetry/test_metrics.py
index 81143800..f07ea61e 100644
--- a/test/telemetry/test_metrics.py
+++ b/test/telemetry/test_metrics.py
@@ -569,11 +569,7 @@ def test_token_counters_lazy_initialization(enable_metrics):
     from mellea.telemetry.metrics import record_token_usage_metrics
 
     record_token_usage_metrics(
-        input_tokens=100,
-        output_tokens=50,
-        model="llama2:7b",
-        backend="OllamaBackend",
-        system="ollama",
+        input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama"
     )
 
     # Now should be initialized
@@ -589,11 +585,7 @@ def test_record_token_usage_metrics_with_valid_tokens(enable_metrics):
 
     # Should not raise
     record_token_usage_metrics(
-        input_tokens=150,
-        output_tokens=50,
-        model="gpt-4",
-        backend="OpenAIBackend",
-        system="openai",
+        input_tokens=150, output_tokens=50, model="gpt-4", provider="openai"
     )
 
 
@@ -603,11 +595,7 @@ def test_record_token_usage_metrics_with_none_tokens(enable_metrics):
 
     # Should not raise
     record_token_usage_metrics(
-        input_tokens=None,
-        output_tokens=None,
-        model="llama2:7b",
-        backend="OllamaBackend",
-        system="ollama",
+        input_tokens=None, output_tokens=None, model="llama2:7b", provider="ollama"
     )
 
 
@@ -617,11 +605,7 @@ def test_record_token_usage_metrics_with_zero_tokens(enable_metrics):
 
     # Should not raise, but won't record zeros
     record_token_usage_metrics(
-        input_tokens=0,
-        output_tokens=0,
-        model="llama2:7b",
-        backend="OllamaBackend",
-        system="ollama",
+        input_tokens=0, output_tokens=0, model="llama2:7b", provider="ollama"
     )
 
 
@@ -631,11 +615,7 @@ def test_record_token_usage_metrics_noop_when_disabled(clean_metrics_env):
 
     # Should not raise and should be no-op
     record_token_usage_metrics(
-        input_tokens=100,
-        output_tokens=50,
-        model="llama2:7b",
-        backend="OllamaBackend",
-        system="ollama",
+        input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama"
     )
 
     # Counters should still be None (not initialized)
diff --git a/test/telemetry/test_metrics_backend.py b/test/telemetry/test_metrics_backend.py
index 5b2702bd..8918564e 100644
--- a/test/telemetry/test_metrics_backend.py
+++ b/test/telemetry/test_metrics_backend.py
@@ -142,12 +142,12 @@ async def test_ollama_token_metrics_integration(enable_metrics, metric_reader, s
 
     # Verify input token counter
     input_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "ollama"}
+        metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "ollama"}
     )
 
     # Verify output token counter
     output_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "ollama"}
+        metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "ollama"}
     )
 
     # Ollama should always return token counts
@@ -198,11 +198,11 @@ async def test_openai_token_metrics_integration(enable_metrics, metric_reader, s
 
     # OpenAI always provides token counts
     input_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "openai"}
+        metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "openai"}
     )
 
     output_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "openai"}
+        metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "openai"}
     )
 
     assert input_tokens is not None, "Input tokens should be recorded"
@@ -246,11 +246,11 @@ async def test_watsonx_token_metrics_integration(enable_metrics, metric_reader):
     metrics_data = metric_reader.get_metrics_data()
 
     input_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "watsonx"}
+        metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "watsonx"}
     )
 
     output_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "watsonx"}
+        metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "watsonx"}
     )
 
     assert input_tokens is not None, "Input tokens should be recorded"
@@ -306,11 +306,11 @@ async def test_litellm_token_metrics_integration(
     metrics_data = metric_reader.get_metrics_data()
 
     input_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "litellm"}
+        metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "litellm"}
     )
 
     output_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "litellm"}
+        metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "litellm"}
     )
 
     # LiteLLM with Ollama backend should always provide token counts
@@ -357,11 +357,13 @@ async def test_huggingface_token_metrics_integration(
 
     # HuggingFace computes token counts locally
     input_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "huggingface"}
+        metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "huggingface"}
     )
 
     output_tokens = get_metric_value(
-        metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "huggingface"}
+        metrics_data,
+        "mellea.llm.tokens.output",
+        {"gen_ai.provider.name": "huggingface"},
     )
 
     assert input_tokens is not None, "Input tokens should be recorded"
diff --git a/test/telemetry/test_metrics_token.py b/test/telemetry/test_metrics_token.py
index a28a9460..2a61acce 100644
--- a/test/telemetry/test_metrics_token.py
+++ b/test/telemetry/test_metrics_token.py
@@ -58,11 +58,7 @@ def test_record_token_metrics_basic(clean_metrics_env):
 
     # Record some token usage
     record_token_usage_metrics(
-        input_tokens=150,
-        output_tokens=50,
-        model="llama2:7b",
-        backend="OllamaBackend",
-        system="ollama",
+        input_tokens=150, output_tokens=50, model="llama2:7b", provider="ollama"
     )
 
     # Force metrics collection
@@ -86,9 +82,8 @@ def test_record_token_metrics_basic(clean_metrics_env):
                     # Verify attributes
                     for data_point in metric.data.data_points:
                         attrs = dict(data_point.attributes)
-                        assert attrs["gen_ai.system"] == "ollama"
+                        assert attrs["gen_ai.provider.name"] == "ollama"
                         assert attrs["gen_ai.request.model"] == "llama2:7b"
-                        assert attrs["mellea.backend"] == "OllamaBackend"
                         assert data_point.value == 150
 
                 if metric.name == "mellea.llm.tokens.output":
@@ -96,9 +91,8 @@ def test_record_token_metrics_basic(clean_metrics_env):
                     # Verify attributes
                     for data_point in metric.data.data_points:
                         attrs = dict(data_point.attributes)
-                        assert attrs["gen_ai.system"] == "ollama"
+                        assert attrs["gen_ai.provider.name"] == "ollama"
                         assert attrs["gen_ai.request.model"] == "llama2:7b"
-                        assert attrs["mellea.backend"] == "OllamaBackend"
                         assert data_point.value == 50
 
     assert found_input, "Input token metric not found"
@@ -120,18 +114,10 @@ def test_record_token_metrics_accumulation(clean_metrics_env):
 
     # Record multiple token usages with same attributes
     record_token_usage_metrics(
-        input_tokens=100,
-        output_tokens=30,
-        model="gpt-4",
-        backend="OpenAIBackend",
-        system="openai",
+        input_tokens=100, output_tokens=30, model="gpt-4", provider="openai"
     )
     record_token_usage_metrics(
-        input_tokens=200,
-        output_tokens=70,
-        model="gpt-4",
-        backend="OpenAIBackend",
-        system="openai",
+        input_tokens=200, output_tokens=70, model="gpt-4", provider="openai"
     )
 
     # Force metrics collection
@@ -166,11 +152,7 @@ def test_record_token_metrics_none_handling(clean_metrics_env):
 
     # Record with None values (should not crash)
     record_token_usage_metrics(
-        input_tokens=None,
-        output_tokens=None,
-        model="llama2:7b",
-        backend="OllamaBackend",
-        system="ollama",
+        input_tokens=None, output_tokens=None, model="llama2:7b", provider="ollama"
     )
 
     # Should not raise, and no metrics should be recorded for None values
@@ -203,25 +185,13 @@ def test_record_token_metrics_multiple_backends(clean_metrics_env):
 
     # Record from different backends
     record_token_usage_metrics(
-        input_tokens=100,
-        output_tokens=50,
-        model="llama2:7b",
-        backend="OllamaBackend",
-        system="ollama",
+        input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama"
     )
     record_token_usage_metrics(
-        input_tokens=200,
-        output_tokens=80,
-        model="gpt-4",
-        backend="OpenAIBackend",
-        system="openai",
+        input_tokens=200, output_tokens=80, model="gpt-4", provider="openai"
     )
     record_token_usage_metrics(
-        input_tokens=150,
-        output_tokens=60,
-        model="granite-3-8b",
-        backend="WatsonxBackend",
-        system="watsonx",
+        input_tokens=150, output_tokens=60, model="granite-3-8b", provider="watsonx"
     )
 
     # Force metrics collection
@@ -240,18 +210,16 @@ def test_record_token_metrics_multiple_backends(clean_metrics_env):
                     for dp in metric.data.data_points:
                         attrs = dict(dp.attributes)
                         key = (
-                            attrs["gen_ai.system"],
+                            attrs["gen_ai.provider.name"],
                             attrs["gen_ai.request.model"],
-                            attrs["mellea.backend"],
                         )
                         input_attrs.add(key)
                 if metric.name == "mellea.llm.tokens.output":
                     for dp in metric.data.data_points:
                         attrs = dict(dp.attributes)
                         key = (
-                            attrs["gen_ai.system"],
+                            attrs["gen_ai.provider.name"],
                             attrs["gen_ai.request.model"],
-                            attrs["mellea.backend"],
                         )
                         output_attrs.add(key)
 
diff --git a/uv.lock b/uv.lock
index e64504fb..f64846ff 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = ">=3.11"
 resolution-markers = [
     "python_full_version >= '3.14' and python_full_version < '4'",
@@ -3747,6 +3747,8 @@ server = [
     { name = "uvicorn" },
 ]
 telemetry = [
+    { name = "cpex" },
+    { name = "grpcio" },
     { name = "opentelemetry-api" },
     { name = "opentelemetry-distro" },
     { name = "opentelemetry-exporter-otlp" },
@@ -3854,6 +3856,7 @@ requires-dist = [
     { name = "llm-sandbox", extras = ["docker"], marker = "extra == 'sandbox'", specifier = ">=0.3.23" },
     { name = "math-verify" },
     { name = "mellea", extras = ["backends", "docling", "tools", "telemetry", "server", "sandbox", "granite-retriever", "hooks"], marker = "extra == 'all'" },
+    { name = "mellea", extras = ["hooks"], marker = "extra == 'telemetry'" },
     { name = "mellea", extras = ["watsonx", "hf", "vllm", "litellm"], marker = "extra == 'backends'" },
     { name = "mistletoe", specifier = ">=1.4.0" },
     { name = "numpy", marker = "extra == 'vllm'", specifier = "<=2.2" },
@@ -5813,11 +5816,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/75/2e/a9e28941c6dab6f06e6d3f6783d3373044be9b0f9a9d3492c3d8d2260ac0/pybase64-1.4.3-cp312-cp312-win32.whl", hash = "sha256:7bca1ed3a5df53305c629ca94276966272eda33c0d71f862d2d3d043f1e1b91a", size = 33686, upload-time = "2025-12-06T13:23:37.848Z" },
     { url = "https://files.pythonhosted.org/packages/83/e3/507ab649d8c3512c258819c51d25c45d6e29d9ca33992593059e7b646a33/pybase64-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:9f2da8f56d9b891b18b4daf463a0640eae45a80af548ce435be86aa6eff3603b", size = 35833, upload-time = "2025-12-06T13:23:38.877Z" },
     { url = "https://files.pythonhosted.org/packages/bc/8a/6eba66cd549a2fc74bb4425fd61b839ba0ab3022d3c401b8a8dc2cc00c7a/pybase64-1.4.3-cp312-cp312-win_arm64.whl", hash = "sha256:0631d8a2d035de03aa9bded029b9513e1fee8ed80b7ddef6b8e9389ffc445da0", size = 31185, upload-time = "2025-12-06T13:23:39.908Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/50/b7170cb2c631944388fe2519507fe3835a4054a6a12a43f43781dae82be1/pybase64-1.4.3-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:ea4b785b0607d11950b66ce7c328f452614aefc9c6d3c9c28bae795dc7f072e1", size = 33901, upload-time = "2025-12-06T13:23:40.951Z" },
     { url = "https://files.pythonhosted.org/packages/48/8b/69f50578e49c25e0a26e3ee72c39884ff56363344b79fc3967f5af420ed6/pybase64-1.4.3-cp313-cp313-android_21_x86_64.whl", hash = "sha256:6a10b6330188c3026a8b9c10e6b9b3f2e445779cf16a4c453d51a072241c65a2", size = 40807, upload-time = "2025-12-06T13:23:42.006Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/8d/20b68f11adfc4c22230e034b65c71392e3e338b413bf713c8945bd2ccfb3/pybase64-1.4.3-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:27fdff227a0c0e182e0ba37a99109645188978b920dfb20d8b9c17eeee370d0d", size = 30932, upload-time = "2025-12-06T13:23:43.348Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/79/b1b550ac6bff51a4880bf6e089008b2e1ca16f2c98db5e039a08ac3ad157/pybase64-1.4.3-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2a8204f1fdfec5aa4184249b51296c0de95445869920c88123978304aad42df1", size = 31394, upload-time = "2025-12-06T13:23:44.317Z" },
-    { url = "https://files.pythonhosted.org/packages/82/70/b5d7c5932bf64ee1ec5da859fbac981930b6a55d432a603986c7f509c838/pybase64-1.4.3-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:874fc2a3777de6baf6aa921a7aa73b3be98295794bea31bd80568a963be30767", size = 38078, upload-time = "2025-12-06T13:23:45.348Z" },
     { url = "https://files.pythonhosted.org/packages/1c/c9/24b3b905cf75e23a9a4deaf203b35ffcb9f473ac0e6d8257f91a05dfce62/pybase64-1.4.3-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:1d45c8fe8fe82b65c36b227bb4a2cf623d9ada16bed602ce2d3e18c35285b72a", size = 68244, upload-time = "2025-12-06T13:23:49.026Z" },
     { url = "https://files.pythonhosted.org/packages/f8/cd/d15b0c3e25e5859fab0416dc5b96d34d6bd2603c1c96a07bb2202b68ab92/pybase64-1.4.3-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ad70c26ba091d8f5167e9d4e1e86a0483a5414805cdb598a813db635bd3be8b8", size = 71620, upload-time = "2025-12-06T13:23:50.081Z" },
     { url = "https://files.pythonhosted.org/packages/0d/31/4ca953cc3dcde2b3711d6bfd70a6f4ad2ca95a483c9698076ba605f1520f/pybase64-1.4.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e98310b7c43145221e7194ac9fa7fffc84763c87bfc5e2f59f9f92363475bdc1", size = 59930, upload-time = "2025-12-06T13:23:51.68Z" },
@@ -5852,11 +5851,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/42/10/abb7757c330bb869ebb95dab0c57edf5961ffbd6c095c8209cbbf75d117d/pybase64-1.4.3-cp313-cp313t-win32.whl", hash = "sha256:46d75c9387f354c5172582a9eaae153b53a53afeb9c19fcf764ea7038be3bd8b", size = 33965, upload-time = "2025-12-06T13:24:28.548Z" },
     { url = "https://files.pythonhosted.org/packages/63/a0/2d4e5a59188e9e6aed0903d580541aaea72dcbbab7bf50fb8b83b490b6c3/pybase64-1.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:d7344625591d281bec54e85cbfdab9e970f6219cac1570f2aa140b8c942ccb81", size = 36207, upload-time = "2025-12-06T13:24:29.646Z" },
     { url = "https://files.pythonhosted.org/packages/1f/05/95b902e8f567b4d4b41df768ccc438af618f8d111e54deaf57d2df46bd76/pybase64-1.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:28a3c60c55138e0028313f2eccd321fec3c4a0be75e57a8d3eb883730b1b0880", size = 31505, upload-time = "2025-12-06T13:24:30.687Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/80/4bd3dff423e5a91f667ca41982dc0b79495b90ec0c0f5d59aca513e50f8c/pybase64-1.4.3-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:015bb586a1ea1467f69d57427abe587469392215f59db14f1f5c39b52fdafaf5", size = 33835, upload-time = "2025-12-06T13:24:31.767Z" },
     { url = "https://files.pythonhosted.org/packages/45/60/a94d94cc1e3057f602e0b483c9ebdaef40911d84a232647a2fe593ab77bb/pybase64-1.4.3-cp314-cp314-android_24_x86_64.whl", hash = "sha256:d101e3a516f837c3dcc0e5a0b7db09582ebf99ed670865223123fb2e5839c6c0", size = 40673, upload-time = "2025-12-06T13:24:32.82Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/71/cf62b261d431857e8e054537a5c3c24caafa331de30daede7b2c6c558501/pybase64-1.4.3-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:8f183ac925a48046abe047360fe3a1b28327afb35309892132fe1915d62fb282", size = 30939, upload-time = "2025-12-06T13:24:34.001Z" },
-    { url = "https://files.pythonhosted.org/packages/24/3e/d12f92a3c1f7c6ab5d53c155bff9f1084ba997a37a39a4f781ccba9455f3/pybase64-1.4.3-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30bf3558e24dcce4da5248dcf6d73792adfcf4f504246967e9db155be4c439ad", size = 31401, upload-time = "2025-12-06T13:24:35.11Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/3d/9c27440031fea0d05146f8b70a460feb95d8b4e3d9ca8f45c972efb4c3d3/pybase64-1.4.3-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a674b419de318d2ce54387dd62646731efa32b4b590907800f0bd40675c1771d", size = 38075, upload-time = "2025-12-06T13:24:36.53Z" },
     { url = "https://files.pythonhosted.org/packages/db/26/b136a4b65e5c94ff06217f7726478df3f31ab1c777c2c02cf698e748183f/pybase64-1.4.3-cp314-cp314-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:b51204d349a4b208287a8aa5b5422be3baa88abf6cc8ff97ccbda34919bbc857", size = 68460, upload-time = "2025-12-06T13:24:41.735Z" },
     { url = "https://files.pythonhosted.org/packages/68/6d/84ce50e7ee1ae79984d689e05a9937b2460d4efa1e5b202b46762fb9036c/pybase64-1.4.3-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:30f2fd53efecbdde4bdca73a872a68dcb0d1bf8a4560c70a3e7746df973e1ef3", size = 71688, upload-time = "2025-12-06T13:24:42.908Z" },
     { url = "https://files.pythonhosted.org/packages/e3/57/6743e420416c3ff1b004041c85eb0ebd9c50e9cf05624664bfa1dc8b5625/pybase64-1.4.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0932b0c5cfa617091fd74f17d24549ce5de3628791998c94ba57be808078eeaf", size = 60040, upload-time = "2025-12-06T13:24:44.37Z" },

From d35d37a7ed2aec987ce5197caa110456cb956b09 Mon Sep 17 00:00:00 2001
From: Alex Bozarth <ajbozart@us.ibm.com>
Date: Fri, 13 Mar 2026 19:14:16 -0500
Subject: [PATCH 3/4] test: add telemetry field assertions to backend tests

Add assertions to verify usage, model, and provider fields are populated
in test_async_avalue tests for all 5 supported backends.

Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
---
 docs/dev/metrics_refactor_plan.md    | 673 ---------------------------
 test/backends/test_huggingface.py    |   8 +
 test/backends/test_litellm_ollama.py |   8 +
 test/backends/test_ollama.py         |   8 +
 test/backends/test_openai_ollama.py  |   8 +
 test/backends/test_watsonx.py        |   8 +
 6 files changed, 40 insertions(+), 673 deletions(-)
 delete mode 100644 docs/dev/metrics_refactor_plan.md

diff --git a/docs/dev/metrics_refactor_plan.md b/docs/dev/metrics_refactor_plan.md
deleted file mode 100644
index 4a82ce7b..00000000
--- a/docs/dev/metrics_refactor_plan.md
+++ /dev/null
@@ -1,673 +0,0 @@
-# Metrics Refactor Plan: Migrate to Hooks/Plugins System
-
-## Overview
-
-This plan addresses issues [#607](https://github.com/generative-computing/mellea/issues/607) and [#608](https://github.com/generative-computing/mellea/issues/608) to refactor the token metrics implementation added in commit `0e71558` to use the hooks/plugins system introduced in commit `cbd63bd`.
-
-**Current State**: Token metrics are recorded via direct `record_token_usage_metrics()` calls in each backend's `post_processing()` method (5 backends: OpenAI, Ollama, LiteLLM, HuggingFace, WatsonX).
-
-**Target State**: Token metrics recorded via a plugin that hooks into `generation_post_call`, with token usage data standardized on `ModelOutputThunk` using a `usage` field that matches the OpenAI API standard.
-
-## Key Commits Analysis
-
-### Commit 0e71558 (Metrics Implementation)
-- Added `mellea/telemetry/metrics.py` with `record_token_usage_metrics()`
-- Modified 5 backends to call `record_token_usage_metrics()` in their `post_processing()` methods
-- Token extraction logic varies per backend:
-  - **OpenAI/LiteLLM/WatsonX**: Extract from `usage` dict via `get_value(usage, "prompt_tokens")`
-  - **Ollama**: Extract from response attributes (`prompt_eval_count`, `eval_count`)
-  - **HuggingFace**: Calculate from `input_ids` and output sequences
-- Each backend stores usage info in different `_meta` locations
-
-### Commit cbd63bd (Hooks/Plugins System)
-- Added complete hooks/plugins infrastructure in `mellea/plugins/`
-- Implemented `generation_post_call` hook that fires after `post_process()` completes
-- Hook fires in `ModelOutputThunk.astream()` at line 384-398 in `mellea/core/base.py`
-- Payload: `GenerationPostCallPayload(prompt, model_output, latency_ms)`
-- Policy: `generation_post_call` is **observe-only** (no writable fields)
-
-## Problem Statement
-
-### Issue #607: Standardize Token Storage
-**Problem**: Token usage is stored inconsistently across backends in various `_meta` locations, making programmatic access difficult.
-
-**Solution**: Add a standard `usage` field to `ModelOutputThunk` that all backends populate, matching the OpenAI API standard.
-
-### Issue #608: Use Hooks for Metrics
-**Problem**: Metrics recording is duplicated across 5 backends, tightly coupled to backend implementation.
-
-**Solution**: Create a metrics plugin that hooks `generation_post_call` to record token usage from the standardized field.
-
-## Architecture Decision
-
-The refactor follows this sequence:
-
-1. **First** (Issue #607): Standardize token storage on `ModelOutputThunk`
-2. **Then** (Issue #608): Create metrics plugin to consume standardized data
-
-This ordering is critical because:
-- The plugin needs a consistent data source to read from
-- Backends must populate the standard field before the hook fires
-- The hook fires **after** `post_processing()` completes, so backends have already extracted tokens
-
-## Detailed Implementation Plan
-
-### Phase 1: Standardize Token Storage (Issue #607)
-
-#### 1.1 Add `usage` Field to ModelOutputThunk
-
-**File**: `mellea/core/base.py`
-
-**Changes**:
-```python
-class ModelOutputThunk(CBlock, Generic[S]):
-    def __init__(
-        self,
-        value: str | None,
-        meta: dict[str, Any] | None = None,
-        parsed_repr: S | None = None,
-        tool_calls: dict[str, ModelToolCall] | None = None,
-    ):
-        # ... existing code ...
-        
-        # Add new field for standardized usage information
-        self.usage: dict[str, int] | None = None
-        """Usage information following OpenAI API standard.
-        
-        Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'.
-        Populated by backends during post_processing. None if unavailable.
-        
-        Future: May include optional breakdown fields like 'completion_tokens_details'
-        and 'prompt_tokens_details' for advanced features (reasoning, audio, caching).
-        """
-```
-
-**Rationale**:
-- Matches OpenAI API standard (industry convention)
-- Consistent with existing `tool_calls` field pattern
-- Extensible for future usage fields beyond tokens (cost, reasoning tokens, cached tokens)
-- Simple dict format compatible with all backends
-- Optional (None) when backend doesn't provide usage info
-- Accessible via `model_output.usage` in hooks and user code
-
-#### 1.2 Update Backend `post_processing()` Methods
-
-Each backend's `post_processing()` method must populate `mot.token_usage` **before** the method returns (since `generation_post_call` hook fires after `post_processing` completes).
-
-**Pattern for all backends**:
-```python
-async def post_processing(self, mot: ModelOutputThunk, ...):
-    # ... existing processing logic ...
-    
-    # Extract token usage (backend-specific logic)
-    prompt_tokens = <extract_prompt_tokens>
-    completion_tokens = <extract_completion_tokens>
-    
-    # Populate standardized field (matches OpenAI API format)
-    if prompt_tokens is not None or completion_tokens is not None:
-        mot.usage = {
-            "prompt_tokens": prompt_tokens or 0,
-            "completion_tokens": completion_tokens or 0,
-            "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0),
-        }
-    
-    # REMOVE: Direct metrics recording
-    # from ..telemetry.metrics import record_token_usage_metrics
-    # record_token_usage_metrics(...)
-```
-
-**Backend-Specific Extraction Logic**:
-
-1. **OpenAI** (`mellea/backends/openai.py:562-646`):
-   ```python
-   # Extract from response or streaming usage (lines 620-626)
-   response = mot._meta["oai_chat_response"]
-   usage = response.get("usage") if isinstance(response, dict) else None
-   if usage is None:
-       usage = mot._meta.get("oai_streaming_usage")
-   
-   # Populate standardized field (already matches OpenAI format)
-   if usage:
-       mot.usage = {
-           "prompt_tokens": get_value(usage, "prompt_tokens") or 0,
-           "completion_tokens": get_value(usage, "completion_tokens") or 0,
-           "total_tokens": get_value(usage, "total_tokens") or 0,
-       }
-   ```
-
-2. **Ollama** (`mellea/backends/ollama.py:583-642`):
-   ```python
-   # Extract from response attributes (lines 620-623)
-   response = mot._meta.get("ollama_response")
-   prompt_tokens = getattr(response, "prompt_eval_count", None) if response else None
-   completion_tokens = getattr(response, "eval_count", None) if response else None
-   
-   # Convert to OpenAI-compatible format
-   if prompt_tokens is not None or completion_tokens is not None:
-       mot.usage = {
-           "prompt_tokens": prompt_tokens or 0,
-           "completion_tokens": completion_tokens or 0,
-           "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0),
-       }
-   ```
-
-3. **LiteLLM** (`mellea/backends/litellm.py:425-508`):
-   ```python
-   # Extract from full response or streaming usage (lines 483-489)
-   full_response = mot._meta.get("litellm_full_response")
-   usage = full_response.get("usage") if isinstance(full_response, dict) else None
-   if usage is None:
-       usage = mot._meta.get("litellm_streaming_usage")
-   
-   # Populate standardized field (already matches OpenAI format)
-   if usage:
-       mot.usage = {
-           "prompt_tokens": get_value(usage, "prompt_tokens") or 0,
-           "completion_tokens": get_value(usage, "completion_tokens") or 0,
-           "total_tokens": get_value(usage, "total_tokens") or 0,
-       }
-   ```
-
-4. **HuggingFace** (`mellea/backends/huggingface.py:1001-1092`):
-   ```python
-   # Calculate from sequences (lines 1063-1076)
-   hf_output = mot._meta.get("hf_output")
-   if isinstance(hf_output, GenerateDecoderOnlyOutput):
-       if input_ids is not None and hf_output.sequences is not None:
-           try:
-               n_prompt = input_ids.shape[1]
-               n_completion = hf_output.sequences[0].shape[0] - n_prompt
-               # Convert to OpenAI-compatible format
-               mot.usage = {
-                   "prompt_tokens": n_prompt,
-                   "completion_tokens": n_completion,
-                   "total_tokens": n_prompt + n_completion,
-               }
-           except Exception:
-               pass  # Leave as None if calculation fails
-   ```
-
-5. **WatsonX** (`mellea/backends/watsonx.py:450-508`):
-   ```python
-   # Extract from response usage (similar to OpenAI)
-   response = mot._meta.get("watsonx_response")
-   usage = response.get("usage") if isinstance(response, dict) else None
-   
-   # Populate standardized field (already matches OpenAI format)
-   if usage:
-       mot.usage = {
-           "prompt_tokens": get_value(usage, "prompt_tokens") or 0,
-           "completion_tokens": get_value(usage, "completion_tokens") or 0,
-           "total_tokens": get_value(usage, "total_tokens") or 0,
-       }
-   ```
-
-**Files to Modify**:
-- `mellea/core/base.py` (add field)
-- `mellea/backends/openai.py` (populate field, remove direct metrics call)
-- `mellea/backends/ollama.py` (populate field, remove direct metrics call)
-- `mellea/backends/litellm.py` (populate field, remove direct metrics call)
-- `mellea/backends/huggingface.py` (populate field, remove direct metrics call)
-- `mellea/backends/watsonx.py` (populate field, remove direct metrics call)
-
-### Phase 2: Create Metrics Plugin (Issue #608)
-
-#### 2.1 Create Token Metrics Plugin
-
-**New File**: `mellea/plugins/builtin/token_metrics.py`
-
-**Implementation**:
-```python
-"""Built-in plugin for recording token usage metrics via OpenTelemetry."""
-
-from mellea.plugins import Plugin, hook, HookType, PluginMode
-from mellea.plugins.hooks.generation import GenerationPostCallPayload
-
-
-class TokenMetricsPlugin(Plugin, name="token-metrics", priority=100):
-    """Records token usage metrics from generation_post_call hook.
-    
-    This plugin automatically records input/output token counts to OpenTelemetry
-    metrics when MELLEA_METRICS_ENABLED=true. It reads from the standardized
-    ModelOutputThunk.usage field populated by backends.
-    
-    Execution mode: FIRE_AND_FORGET (async, non-blocking, observe-only)
-    Priority: 100 (runs after other plugins)
-    """
-    
-    @hook(HookType.GENERATION_POST_CALL, mode=PluginMode.FIRE_AND_FORGET)
-    async def record_tokens(
-        self,
-        payload: GenerationPostCallPayload,
-        ctx
-    ):
-        """Record token usage metrics from the model output."""
-        from mellea.telemetry.metrics import is_metrics_enabled
-        
-        # Early return if metrics disabled (zero overhead)
-        if not is_metrics_enabled():
-            return
-        
-        mot = payload.model_output
-        if mot is None or mot.usage is None:
-            return
-        
-        # Extract backend info from context
-        backend = ctx.global_context.state.get("backend")
-        if backend is None:
-            return
-        
-        from mellea.telemetry.backend_instrumentation import (
-            get_model_id_str,
-            get_system_name,
-        )
-        from mellea.telemetry.metrics import record_token_usage_metrics
-        
-        # Record using standardized usage dict
-        record_token_usage_metrics(
-            input_tokens=mot.usage.get("prompt_tokens"),
-            output_tokens=mot.usage.get("completion_tokens"),
-            model=get_model_id_str(backend),
-            backend=backend.__class__.__name__,
-            system=get_system_name(backend),
-        )
-```
-
-**Key Design Decisions**:
-- **FIRE_AND_FORGET mode**: Metrics recording is async, non-blocking, and cannot fail the generation
-- **Priority 100**: Runs after other plugins (lower priority = earlier execution)
-- **Observe-only**: Reads from `model_output.usage`, doesn't modify payload
-- **Zero overhead**: Early return when metrics disabled
-- **Backend-agnostic**: Works with any backend that populates `usage`
-- **OpenAI-compatible**: Uses standard field name for familiarity and future extensibility
-
-#### 2.2 Auto-Register Plugin When Metrics Enabled
-
-**File**: `mellea/telemetry/metrics.py`
-
-**Changes**:
-```python
-# At module initialization (after _meter setup, around line 247)
-if _OTEL_AVAILABLE and _METRICS_ENABLED:
-    _meter_provider = _setup_meter_provider()
-    if _meter_provider is not None:
-        _meter = metrics.get_meter("mellea.metrics", version("mellea"))
-        
-        # Auto-register token metrics plugin
-        from mellea.plugins.builtin.token_metrics import TokenMetricsPlugin
-        from mellea.plugins import register
-        
-        _token_metrics_plugin = TokenMetricsPlugin()
-        register(_token_metrics_plugin)
-```
-
-**Rationale**:
-- Automatic activation when `MELLEA_METRICS_ENABLED=true`
-- No user code changes required
-- Consistent with existing metrics module behavior
-- Plugin is globally registered (works with both session and functional API)
-
-#### 2.3 Update Plugin Module Structure
-
-**New Directory**: `mellea/plugins/builtin/`
-
-**Files**:
-- `mellea/plugins/builtin/__init__.py` - Export built-in plugins
-- `mellea/plugins/builtin/token_metrics.py` - Token metrics plugin
-
-**Update**: `mellea/plugins/__init__.py`
-```python
-# Add to exports
-from mellea.plugins.builtin import TokenMetricsPlugin
-
-__all__ = [
-    # ... existing exports ...
-    "TokenMetricsPlugin",
-]
-```
-
-### Phase 3: Update Tests
-
-#### 3.1 Update Backend Integration Tests
-
-**Files**: `test/telemetry/test_metrics_backend.py`
-
-**Changes**:
-- Tests should verify `mot.token_usage` is populated (not just metrics recorded)
-- Add assertions: `assert mot.token_usage is not None`
-- Add assertions: `assert mot.token_usage["prompt_tokens"] > 0`
-- Keep existing metrics verification (plugin should still record them)
-
-**Example**:
-```python
-async def test_openai_token_metrics(ollama_backend):
-    """Test that OpenAI backend populates usage and metrics are recorded."""
-    # ... existing test setup ...
-    
-    # Verify usage field is populated
-    assert mot.usage is not None
-    assert mot.usage["prompt_tokens"] > 0
-    assert mot.usage["completion_tokens"] > 0
-    assert mot.usage["total_tokens"] > 0
-    
-    # Verify metrics were recorded (via plugin)
-    # ... existing metrics verification ...
-```
-
-#### 3.2 Add Plugin-Specific Tests
-
-**New File**: `test/plugins/test_token_metrics_plugin.py`
-
-**Tests**:
-1. Test plugin registers correctly when metrics enabled
-2. Test plugin reads from `mot.usage` correctly
-3. Test plugin handles missing `usage` gracefully
-4. Test plugin is no-op when metrics disabled
-5. Test plugin works with all backends
-6. Test plugin doesn't block on errors (FIRE_AND_FORGET mode)
-
-#### 3.3 Update Unit Tests
-
-**File**: `test/telemetry/test_metrics_token.py`
-
-**Changes**:
-- Tests for `record_token_usage_metrics()` remain unchanged (function still exists)
-- Add note that function is now called by plugin, not backends directly
-
-### Phase 4: Update Documentation
-
-#### 4.1 Update Telemetry Documentation
-
-**File**: `docs/dev/telemetry.md`
-
-**Changes**:
-- Update "Token Usage Metrics" section to explain plugin-based architecture
-- Add note about `ModelOutputThunk.usage` field (OpenAI-compatible)
-- Update backend support table to note standardized field
-- Add example of accessing token usage programmatically
-
-**Example Addition**:
-```markdown
-### Programmatic Access to Token Usage
-
-Token usage information is available on the `ModelOutputThunk` after generation:
-
-```python
-with start_session() as m:
-    result = m.instruct("What is the capital of France?")
-    
-    # Access token usage (OpenAI-compatible format)
-    if result.usage:
-        print(f"Prompt tokens: {result.usage['prompt_tokens']}")
-        print(f"Completion tokens: {result.usage['completion_tokens']}")
-        print(f"Total tokens: {result.usage['total_tokens']}")
-```
-
-Token metrics are automatically recorded to OpenTelemetry when 
-`MELLEA_METRICS_ENABLED=true` via the built-in `TokenMetricsPlugin`.
-```
-
-#### 4.2 Update Plugin Documentation
-
-**File**: `docs/docs/core-concept/plugins.mdx`
-
-**Changes**:
-- Add section on built-in plugins
-- Document `TokenMetricsPlugin` as an example of FIRE_AND_FORGET mode
-- Show how built-in plugins auto-register
-
-#### 4.3 Update Examples
-
-**File**: `docs/examples/telemetry/metrics_example.py`
-
-**Changes**:
-- Add example showing programmatic access to `usage`
-- Add comment explaining plugin-based architecture
-- No functional changes (metrics still work the same way)
-
-**Example Addition**:
-```python
-# Example 5: Programmatic token access
-print("\n5. Accessing token usage programmatically...")
-result = m.instruct("Count to five")
-if result.usage:
-    print(f"  Prompt tokens: {result.usage['prompt_tokens']}")
-    print(f"  Completion tokens: {result.usage['completion_tokens']}")
-    print(f"  Total tokens: {result.usage['total_tokens']}")
-```
-
-### Phase 5: Cleanup
-
-#### 5.1 Remove Backend-Specific Metrics Code
-
-**All Backend Files**:
-- Remove `from ..telemetry.metrics import is_metrics_enabled` imports
-- Remove `from ..telemetry.metrics import record_token_usage_metrics` imports
-- Remove `from ..telemetry.backend_instrumentation import get_model_id_str, get_system_name` imports (if only used for metrics)
-- Remove `if is_metrics_enabled():` blocks and `record_token_usage_metrics()` calls
-
-**Estimated Lines Removed**: ~15-20 lines per backend × 5 backends = ~75-100 lines
-
-#### 5.2 Keep Core Metrics Infrastructure
-
-**Files to Keep Unchanged**:
-- `mellea/telemetry/metrics.py` - Core metrics functions still needed by plugin
-- `mellea/telemetry/backend_instrumentation.py` - Helper functions still needed
-
-## Implementation Order
-
-### Step 1: Add `token_usage` Field (Issue #607)
-1. Modify `mellea/core/base.py` to add `token_usage` field
-2. Update all 5 backends to populate the field
-3. Keep existing `record_token_usage_metrics()` calls temporarily (dual-write)
-4. Run tests to verify field is populated correctly
-
-### Step 2: Create Metrics Plugin (Issue #608)
-5. Create `mellea/plugins/builtin/` directory
-6. Implement `TokenMetricsPlugin` in `token_metrics.py`
-7. Auto-register plugin in `mellea/telemetry/metrics.py`
-8. Run tests to verify plugin records metrics correctly
-
-### Step 3: Remove Duplicate Code
-9. Remove direct `record_token_usage_metrics()` calls from all backends
-10. Remove unused imports from backends
-11. Run full test suite to verify no regressions
-
-### Step 4: Update Documentation and Tests
-12. Update backend integration tests to verify `token_usage` field
-13. Add plugin-specific tests
-14. Update documentation (telemetry.md, plugins.mdx)
-15. Update examples to show programmatic access
-
-## Testing Strategy
-
-### Test Coverage Required
-
-1. **Unit Tests** (existing, should pass unchanged):
-   - `test/telemetry/test_metrics_token.py` - Tests for `record_token_usage_metrics()`
-
-2. **Backend Integration Tests** (need updates):
-   - `test/telemetry/test_metrics_backend.py` - Add `token_usage` field assertions
-   - All 5 backend tests should verify field population
-
-3. **Plugin Tests** (new):
-   - `test/plugins/test_token_metrics_plugin.py` - Plugin-specific tests
-   - Test plugin registration, execution, error handling
-
-4. **End-to-End Tests** (existing, should pass):
-   - `docs/examples/telemetry/metrics_example.py` - Should work unchanged
-
-### Test Execution
-
-```bash
-# Run metrics tests
-uv run pytest test/telemetry/test_metrics*.py -v
-
-# Run plugin tests
-uv run pytest test/plugins/test_token_metrics_plugin.py -v
-
-# Run backend tests
-uv run pytest test/backends/ -k "metrics" -v
-
-# Run example as test
-uv run pytest docs/examples/telemetry/metrics_example.py -v
-```
-
-## Migration Impact
-
-### Breaking Changes
-**None** - This is a pure refactor with no API changes:
-- Metrics still recorded automatically when `MELLEA_METRICS_ENABLED=true`
-- Same metric names, attributes, and exporters
-- User code unchanged
-
-### New Features
-- **Programmatic access**: Users can now access `mot.token_usage` directly
-- **Extensibility**: Users can create custom metrics plugins following this pattern
-- **Consistency**: Token data in standard location across all backends
-
-### Performance Impact
-- **Negligible**: Plugin uses FIRE_AND_FORGET mode (async, non-blocking)
-- **Same overhead**: Metrics recording logic unchanged, just moved to plugin
-- **Zero overhead when disabled**: Early return in plugin when metrics disabled
-
-## Risks and Mitigations
-
-### Risk 1: Hook Timing
-**Risk**: `generation_post_call` fires after `post_processing()`, but what if backends don't populate `token_usage` in time?
-
-**Mitigation**: Backends populate `token_usage` **during** `post_processing()`, which completes **before** the hook fires. The hook call is at line 384-398 in `ModelOutputThunk.astream()`, after line 366 (`await self._post_process(self)`).
-
-### Risk 2: Missing Token Data
-**Risk**: Some backends might not have token usage available.
-
-**Mitigation**: 
-- `token_usage` field is optional (None when unavailable)
-- Plugin checks `if mot.token_usage is None: return`
-- Existing behavior preserved (metrics not recorded when data unavailable)
-
-### Risk 3: Plugin Framework Dependency
-**Risk**: Plugin requires `mellea[hooks]` extra dependency.
-
-**Mitigation**:
-- Plugin only imported when metrics enabled
-- Graceful fallback if hooks not installed (warning message)
-- Most users enabling metrics will have full installation
-
-### Risk 4: Test Failures
-**Risk**: Existing tests might fail if they expect metrics without the plugin.
-
-**Mitigation**:
-- Plugin auto-registers when metrics enabled (same as before)
-- Tests run with `MELLEA_METRICS_ENABLED=true` will get plugin automatically
-- Dual-write during Step 1 ensures no test breakage during transition
-
-## Success Criteria
-
-### Phase 1 Complete When:
-- [ ] `ModelOutputThunk.token_usage` field added
-- [ ] All 5 backends populate the field correctly
-- [ ] Backend integration tests verify field population
-- [ ] Existing metrics tests still pass (dual-write active)
-
-### Phase 2 Complete When:
-- [ ] `TokenMetricsPlugin` implemented and tested
-- [ ] Plugin auto-registers when metrics enabled
-- [ ] Plugin records metrics correctly (verified by existing tests)
-- [ ] Plugin-specific tests added and passing
-
-### Phase 3 Complete When:
-- [ ] Direct `record_token_usage_metrics()` calls removed from backends
-- [ ] Unused imports cleaned up
-- [ ] All tests pass (metrics now recorded via plugin only)
-
-### Phase 4 Complete When:
-- [ ] Documentation updated (telemetry.md, plugins.mdx)
-- [ ] Examples updated to show programmatic access
-- [ ] All documentation builds without errors
-
-### Overall Success:
-- [ ] All tests pass (`uv run pytest test/`)
-- [ ] No breaking changes to user API
-- [ ] Metrics still recorded correctly
-- [ ] Code is cleaner (no duplication across backends)
-- [ ] Token usage accessible programmatically via `mot.token_usage`
-
-## File Checklist
-
-### Files to Create
-- [ ] `mellea/plugins/builtin/__init__.py`
-- [ ] `mellea/plugins/builtin/token_metrics.py`
-- [ ] `test/plugins/test_token_metrics_plugin.py`
-
-### Files to Modify
-- [ ] `mellea/core/base.py` - Add `token_usage` field
-- [ ] `mellea/backends/openai.py` - Populate field, remove metrics call
-- [ ] `mellea/backends/ollama.py` - Populate field, remove metrics call
-- [ ] `mellea/backends/litellm.py` - Populate field, remove metrics call
-- [ ] `mellea/backends/huggingface.py` - Populate field, remove metrics call
-- [ ] `mellea/backends/watsonx.py` - Populate field, remove metrics call
-- [ ] `mellea/telemetry/metrics.py` - Auto-register plugin
-- [ ] `mellea/plugins/__init__.py` - Export TokenMetricsPlugin
-- [ ] `test/telemetry/test_metrics_backend.py` - Add token_usage assertions
-- [ ] `docs/dev/telemetry.md` - Document new architecture
-- [ ] `docs/docs/core-concept/plugins.mdx` - Document built-in plugin
-- [ ] `docs/examples/telemetry/metrics_example.py` - Add programmatic access example
-
-### Files to Review (No Changes Expected)
-- `mellea/telemetry/backend_instrumentation.py` - Helper functions still used
-- `test/telemetry/test_metrics_token.py` - Unit tests for core function
-- `mellea/plugins/hooks/generation.py` - Payload definition unchanged
-
-## Estimated Effort
-
-- **Phase 1** (Standardize storage): 2-3 hours
-  - Add field: 15 min
-  - Update 5 backends: 1.5 hours (30 min each)
-  - Update tests: 1 hour
-
-- **Phase 2** (Create plugin): 1-2 hours
-  - Implement plugin: 45 min
-  - Auto-registration: 15 min
-  - Plugin tests: 1 hour
-
-- **Phase 3** (Cleanup): 30 min
-  - Remove duplicate code: 20 min
-  - Verify tests: 10 min
-
-- **Phase 4** (Documentation): 1 hour
-  - Update docs: 30 min
-  - Update examples: 30 min
-
-**Total**: 4.5-6.5 hours
-
-## Open Questions
-
-1. **Should `token_usage` be added to `GenerationPostCallPayload`?**
-   - Pro: Makes it explicit in the hook payload
-   - Con: Redundant (already on `model_output`)
-   - **Recommendation**: No, keep it on MOT only (single source of truth)
-
-2. **Should the plugin be optional or always registered?**
-   - Current plan: Auto-register when `MELLEA_METRICS_ENABLED=true`
-   - Alternative: Always register, let plugin check `is_metrics_enabled()` internally
-   - **Recommendation**: Auto-register (cleaner, no overhead when disabled)
-
-3. **Should we support custom metrics plugins?**
-   - Users could create their own plugins reading from `token_usage`
-   - **Recommendation**: Yes, document this pattern in plugins.mdx
-
-4. **What about vLLM backend?**
-   - Not modified in commit 0e71558 (no metrics support)
-   - **Recommendation**: Out of scope for this refactor, can be added later
-
-## References
-
-- Issue #607: https://github.com/generative-computing/mellea/issues/607
-- Issue #608: https://github.com/generative-computing/mellea/issues/608
-- PR #563 (metrics): https://github.com/generative-computing/mellea/pull/563
-- PR #582 (hooks): https://github.com/generative-computing/mellea/pull/582
-- Commit 0e71558: Metrics implementation
-- Commit cbd63bd: Hooks/plugins system
-- Hook system spec: `docs/dev/hook_system.md`
-- Plugin examples: `docs/examples/plugins/`
\ No newline at end of file
diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py
index 805f8c4d..e2881f3b 100644
--- a/test/backends/test_huggingface.py
+++ b/test/backends/test_huggingface.py
@@ -328,6 +328,14 @@ async def test_async_avalue(session) -> None:
     assert m1_final_val is not None
     assert m1_final_val == mot1.value
 
+    # Verify telemetry fields are populated
+    assert mot1.usage is not None
+    assert mot1.usage["prompt_tokens"] >= 0
+    assert mot1.usage["completion_tokens"] > 0
+    assert mot1.usage["total_tokens"] > 0
+    assert mot1.model is not None
+    assert mot1.provider == "huggingface"
+
 
 @pytest.mark.qualitative
 async def test_generate_with_lock(backend) -> None:
diff --git a/test/backends/test_litellm_ollama.py b/test/backends/test_litellm_ollama.py
index ece9c890..d41367bb 100644
--- a/test/backends/test_litellm_ollama.py
+++ b/test/backends/test_litellm_ollama.py
@@ -202,6 +202,14 @@ async def test_async_avalue(session):
     assert m1_final_val is not None
     assert m1_final_val == mot1.value
 
+    # Verify telemetry fields are populated
+    assert mot1.usage is not None
+    assert mot1.usage["prompt_tokens"] >= 0
+    assert mot1.usage["completion_tokens"] > 0
+    assert mot1.usage["total_tokens"] > 0
+    assert mot1.model is not None
+    assert mot1.provider == "litellm"
+
 
 if __name__ == "__main__":
     import pytest
diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py
index 706766ea..a4b75f59 100644
--- a/test/backends/test_ollama.py
+++ b/test/backends/test_ollama.py
@@ -196,6 +196,14 @@ async def test_async_avalue(session) -> None:
     assert m1_final_val is not None
     assert m1_final_val == mot1.value
 
+    # Verify telemetry fields are populated
+    assert mot1.usage is not None
+    assert mot1.usage["prompt_tokens"] >= 0
+    assert mot1.usage["completion_tokens"] > 0
+    assert mot1.usage["total_tokens"] > 0
+    assert mot1.model is not None
+    assert mot1.provider == "ollama"
+
 
 def test_multiple_asyncio_runs(session) -> None:
     async def test():
diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py
index 142d0781..10e4ac0b 100644
--- a/test/backends/test_openai_ollama.py
+++ b/test/backends/test_openai_ollama.py
@@ -194,6 +194,14 @@ async def test_async_avalue(m_session) -> None:
     assert m1_final_val is not None
     assert m1_final_val == mot1.value
 
+    # Verify telemetry fields are populated
+    assert mot1.usage is not None
+    assert mot1.usage["prompt_tokens"] >= 0
+    assert mot1.usage["completion_tokens"] > 0
+    assert mot1.usage["total_tokens"] > 0
+    assert mot1.model is not None
+    assert mot1.provider == "openai"
+
 
 def test_client_cache(backend) -> None:
     first_client = backend._async_client
diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py
index c9f27be2..87784e0b 100644
--- a/test/backends/test_watsonx.py
+++ b/test/backends/test_watsonx.py
@@ -207,6 +207,14 @@ async def test_async_avalue(session):
     assert m1_final_val is not None
     assert m1_final_val == mot1.value
 
+    # Verify telemetry fields are populated
+    assert mot1.usage is not None
+    assert mot1.usage["prompt_tokens"] >= 0
+    assert mot1.usage["completion_tokens"] > 0
+    assert mot1.usage["total_tokens"] > 0
+    assert mot1.model is not None
+    assert mot1.provider == "watsonx"
+
 
 def test_client_cache(backend):
     first_client = backend._model

From 13345b6575b097d84a2d5e2274d31e1f30c0f635 Mon Sep 17 00:00:00 2001
From: Alex Bozarth <ajbozart@us.ibm.com>
Date: Fri, 13 Mar 2026 19:52:37 -0500
Subject: [PATCH 4/4] docs: update telemetry docs for plugin-based metrics

Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
---
 AGENTS.md                                  |  2 +-
 docs/dev/telemetry.md                      | 23 +++++++++++++++++++---
 docs/examples/telemetry/metrics_example.py |  6 ++++++
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 0396b617..d414f1a7 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -92,9 +92,9 @@ Tests/examples automatically skip if system lacks required resources. Heavy exam
 - **Google-style docstrings**
 - **Ruff** for linting/formatting
 - Use `...` in `@generative` function bodies
-- Use `...` in `@generative` function bodies
 - Prefer primitives over classes
 - **Friendly Dependency Errors**: Wraps optional backend imports in `try/except ImportError` with a helpful message (e.g., "Please pip install mellea[hf]"). See `mellea/stdlib/session.py` for examples.
+- **Backend telemetry fields**: All backends must populate `mot.usage` (dict with `prompt_tokens`, `completion_tokens`, `total_tokens`), `mot.model` (str), and `mot.provider` (str) in their `post_processing()` method. Metrics are automatically recorded by `TokenMetricsPlugin` — don't add manual `record_token_usage_metrics()` calls.
 
 ## 5. Commits & Hooks
 [Angular format](https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit): `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `release:`
diff --git a/docs/dev/telemetry.md b/docs/dev/telemetry.md
index 09fb342a..b0a52d6b 100644
--- a/docs/dev/telemetry.md
+++ b/docs/dev/telemetry.md
@@ -129,9 +129,8 @@ All token metrics include these attributes following Gen-AI semantic conventions
 
 | Attribute | Description | Example Values |
 |-----------|-------------|----------------|
-| `gen_ai.system` | Backend system name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` |
+| `gen_ai.provider.name` | Backend provider name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` |
 | `gen_ai.request.model` | Model identifier | `gpt-4`, `llama3.2:7b`, `granite-3.1-8b-instruct` |
-| `mellea.backend` | Backend class name | `OpenAIBackend`, `OllamaBackend`, `WatsonxBackend` |
 
 #### Backend Support
 
@@ -365,12 +364,30 @@ if is_metrics_enabled():
     print("Token metrics are being collected")
 ```
 
+Access token usage data from `ModelOutputThunk`:
+
+```python
+from mellea import start_session
+
+with start_session() as m:
+    result = m.instruct("Write a haiku about programming")
+    
+    # Access token usage (follows OpenAI API format)
+    if result.usage:
+        print(f"Prompt tokens: {result.usage['prompt_tokens']}")
+        print(f"Completion tokens: {result.usage['completion_tokens']}")
+        print(f"Total tokens: {result.usage['total_tokens']}")
+```
+
+The `usage` field is a dictionary with three keys: `prompt_tokens`, `completion_tokens`, and `total_tokens`. All backends populate this field consistently.
+
 #### Performance
 
-- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), `record_token_usage_metrics()` returns immediately with no processing
+- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), the TokenMetricsPlugin is not registered and has no overhead
 - **Minimal overhead when enabled**: Counter increments are extremely fast (~nanoseconds per operation)
 - **Async export**: Metrics are batched and exported asynchronously (default: every 60 seconds)
 - **Non-blocking**: Metric recording never blocks LLM calls
+- **Automatic collection**: Metrics are recorded via hooks after generation completes—no manual instrumentation needed
 
 #### Use Cases
 
diff --git a/docs/examples/telemetry/metrics_example.py b/docs/examples/telemetry/metrics_example.py
index d7bf1c54..c8630c55 100644
--- a/docs/examples/telemetry/metrics_example.py
+++ b/docs/examples/telemetry/metrics_example.py
@@ -101,6 +101,12 @@ def main():
         )
         print(f"Email: {str(email)[:100]}...")
 
+        # Token usage is available on the result from instruct()
+        if email.usage:
+            print(f"  → Prompt tokens: {email.usage['prompt_tokens']}")
+            print(f"  → Completion tokens: {email.usage['completion_tokens']}")
+            print(f"  → Total tokens: {email.usage['total_tokens']}")
+
         # Example 3: Multiple operations
         print("\n3. Multiple operations...")
         text = "Hello, how are you today?"