From 4701ec6b5f419a292450d461b7c03d3f2e060378 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Fri, 13 Mar 2026 15:55:45 -0500 Subject: [PATCH 1/4] feat: add usage field to ModelOutputThunk and populate in all backends - Add usage field to ModelOutputThunk with OpenAI-compatible format - Update all 5 backends to populate usage field during post_processing Signed-off-by: Alex Bozarth --- mellea/backends/huggingface.py | 8 ++++++++ mellea/backends/litellm.py | 4 ++++ mellea/backends/ollama.py | 8 ++++++++ mellea/backends/openai.py | 4 ++++ mellea/backends/watsonx.py | 4 ++++ mellea/core/base.py | 10 ++++++++++ 6 files changed, 38 insertions(+) diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index c7886e7e..83655f3e 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -1075,6 +1075,14 @@ async def post_processing( except Exception: pass + # Populate standardized usage field (convert to OpenAI format) + if n_prompt is not None and n_completion is not None: + mot.usage = { + "prompt_tokens": n_prompt, + "completion_tokens": n_completion, + "total_tokens": n_prompt + n_completion, + } + # Record metrics if enabled if metrics_enabled and n_prompt is not None: from ..telemetry.backend_instrumentation import ( diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index 4f5bac38..ef453d29 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -488,6 +488,10 @@ async def post_processing( if usage is None: usage = mot._meta.get("litellm_streaming_usage") + # Populate standardized usage field (LiteLLM uses OpenAI format) + if usage: + mot.usage = usage + # Record metrics if enabled from ..telemetry.metrics import is_metrics_enabled diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index afe80ec8..276bb180 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -622,6 +622,14 @@ async def post_processing( ) completion_tokens = getattr(response, "eval_count", None) if response else None + # Populate standardized usage field (convert to OpenAI format) + if prompt_tokens is not None or completion_tokens is not None: + mot.usage = { + "prompt_tokens": prompt_tokens or 0, + "completion_tokens": completion_tokens or 0, + "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0), + } + # Record metrics if enabled from ..telemetry.metrics import is_metrics_enabled diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index faadfc45..16a93485 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -625,6 +625,10 @@ async def post_processing( if usage is None: usage = mot._meta.get("oai_streaming_usage") + # Populate standardized usage field (OpenAI format already matches) + if usage: + mot.usage = usage + # Record metrics if enabled from ..telemetry.metrics import is_metrics_enabled diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index d6ca943e..5a4bf24a 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -496,6 +496,10 @@ async def post_processing( else getattr(response, "usage", None) ) + # Populate standardized usage field (WatsonX uses OpenAI format) + if usage: + mot.usage = usage + # Record metrics if enabled from ..telemetry.metrics import is_metrics_enabled diff --git a/mellea/core/base.py b/mellea/core/base.py index 60d65029..28b58234 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -196,6 +196,15 @@ def __init__( # Additional fields that should be standardized across apis. self.tool_calls = tool_calls self._thinking: str | None = None + self.usage: dict[str, int] | None = None + """Usage information following OpenAI API standard. + + Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'. + Populated by backends during post_processing. None if unavailable. + + Future: May include optional breakdown fields like 'completion_tokens_details' + and 'prompt_tokens_details' for advanced features (reasoning, audio, caching). + """ # Used for tracking generation. self._context: list[Component | CBlock] | None = None @@ -233,6 +242,7 @@ def _copy_from(self, other: ModelOutputThunk) -> None: self.parsed_repr = other.parsed_repr self.tool_calls = other.tool_calls self._thinking = other._thinking + self.usage = other.usage self._generate_log = other._generate_log def is_computed(self) -> bool: From b4649c9ede1d767eefaf123567004b5304bb5492 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Fri, 13 Mar 2026 19:04:51 -0500 Subject: [PATCH 2/4] refactor: migrate token metrics to plugin-based system Completes migration from direct backend metrics calls to hook-based plugin system. TokenMetricsPlugin now handles all token metrics recording via generation_post_call hook. - Remove record_token_usage_metrics() calls from OpenAI, Ollama, LiteLLM, WatsonX, HuggingFace backends - Update metrics output format for OTel compliance Signed-off-by: Alex Bozarth --- docs/dev/metrics_refactor_plan.md | 673 +++++++++++++++++++++++++ mellea/backends/huggingface.py | 21 +- mellea/backends/litellm.py | 21 +- mellea/backends/ollama.py | 20 +- mellea/backends/openai.py | 21 +- mellea/backends/watsonx.py | 21 +- mellea/core/base.py | 22 + mellea/telemetry/metrics.py | 46 +- mellea/telemetry/metrics_plugins.py | 62 +++ pyproject.toml | 1 + test/telemetry/test_metrics.py | 30 +- test/telemetry/test_metrics_backend.py | 22 +- test/telemetry/test_metrics_token.py | 54 +- uv.lock | 13 +- 14 files changed, 838 insertions(+), 189 deletions(-) create mode 100644 docs/dev/metrics_refactor_plan.md create mode 100644 mellea/telemetry/metrics_plugins.py diff --git a/docs/dev/metrics_refactor_plan.md b/docs/dev/metrics_refactor_plan.md new file mode 100644 index 00000000..4a82ce7b --- /dev/null +++ b/docs/dev/metrics_refactor_plan.md @@ -0,0 +1,673 @@ +# Metrics Refactor Plan: Migrate to Hooks/Plugins System + +## Overview + +This plan addresses issues [#607](https://github.com/generative-computing/mellea/issues/607) and [#608](https://github.com/generative-computing/mellea/issues/608) to refactor the token metrics implementation added in commit `0e71558` to use the hooks/plugins system introduced in commit `cbd63bd`. + +**Current State**: Token metrics are recorded via direct `record_token_usage_metrics()` calls in each backend's `post_processing()` method (5 backends: OpenAI, Ollama, LiteLLM, HuggingFace, WatsonX). + +**Target State**: Token metrics recorded via a plugin that hooks into `generation_post_call`, with token usage data standardized on `ModelOutputThunk` using a `usage` field that matches the OpenAI API standard. + +## Key Commits Analysis + +### Commit 0e71558 (Metrics Implementation) +- Added `mellea/telemetry/metrics.py` with `record_token_usage_metrics()` +- Modified 5 backends to call `record_token_usage_metrics()` in their `post_processing()` methods +- Token extraction logic varies per backend: + - **OpenAI/LiteLLM/WatsonX**: Extract from `usage` dict via `get_value(usage, "prompt_tokens")` + - **Ollama**: Extract from response attributes (`prompt_eval_count`, `eval_count`) + - **HuggingFace**: Calculate from `input_ids` and output sequences +- Each backend stores usage info in different `_meta` locations + +### Commit cbd63bd (Hooks/Plugins System) +- Added complete hooks/plugins infrastructure in `mellea/plugins/` +- Implemented `generation_post_call` hook that fires after `post_process()` completes +- Hook fires in `ModelOutputThunk.astream()` at line 384-398 in `mellea/core/base.py` +- Payload: `GenerationPostCallPayload(prompt, model_output, latency_ms)` +- Policy: `generation_post_call` is **observe-only** (no writable fields) + +## Problem Statement + +### Issue #607: Standardize Token Storage +**Problem**: Token usage is stored inconsistently across backends in various `_meta` locations, making programmatic access difficult. + +**Solution**: Add a standard `usage` field to `ModelOutputThunk` that all backends populate, matching the OpenAI API standard. + +### Issue #608: Use Hooks for Metrics +**Problem**: Metrics recording is duplicated across 5 backends, tightly coupled to backend implementation. + +**Solution**: Create a metrics plugin that hooks `generation_post_call` to record token usage from the standardized field. + +## Architecture Decision + +The refactor follows this sequence: + +1. **First** (Issue #607): Standardize token storage on `ModelOutputThunk` +2. **Then** (Issue #608): Create metrics plugin to consume standardized data + +This ordering is critical because: +- The plugin needs a consistent data source to read from +- Backends must populate the standard field before the hook fires +- The hook fires **after** `post_processing()` completes, so backends have already extracted tokens + +## Detailed Implementation Plan + +### Phase 1: Standardize Token Storage (Issue #607) + +#### 1.1 Add `usage` Field to ModelOutputThunk + +**File**: `mellea/core/base.py` + +**Changes**: +```python +class ModelOutputThunk(CBlock, Generic[S]): + def __init__( + self, + value: str | None, + meta: dict[str, Any] | None = None, + parsed_repr: S | None = None, + tool_calls: dict[str, ModelToolCall] | None = None, + ): + # ... existing code ... + + # Add new field for standardized usage information + self.usage: dict[str, int] | None = None + """Usage information following OpenAI API standard. + + Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'. + Populated by backends during post_processing. None if unavailable. + + Future: May include optional breakdown fields like 'completion_tokens_details' + and 'prompt_tokens_details' for advanced features (reasoning, audio, caching). + """ +``` + +**Rationale**: +- Matches OpenAI API standard (industry convention) +- Consistent with existing `tool_calls` field pattern +- Extensible for future usage fields beyond tokens (cost, reasoning tokens, cached tokens) +- Simple dict format compatible with all backends +- Optional (None) when backend doesn't provide usage info +- Accessible via `model_output.usage` in hooks and user code + +#### 1.2 Update Backend `post_processing()` Methods + +Each backend's `post_processing()` method must populate `mot.token_usage` **before** the method returns (since `generation_post_call` hook fires after `post_processing` completes). + +**Pattern for all backends**: +```python +async def post_processing(self, mot: ModelOutputThunk, ...): + # ... existing processing logic ... + + # Extract token usage (backend-specific logic) + prompt_tokens = + completion_tokens = + + # Populate standardized field (matches OpenAI API format) + if prompt_tokens is not None or completion_tokens is not None: + mot.usage = { + "prompt_tokens": prompt_tokens or 0, + "completion_tokens": completion_tokens or 0, + "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0), + } + + # REMOVE: Direct metrics recording + # from ..telemetry.metrics import record_token_usage_metrics + # record_token_usage_metrics(...) +``` + +**Backend-Specific Extraction Logic**: + +1. **OpenAI** (`mellea/backends/openai.py:562-646`): + ```python + # Extract from response or streaming usage (lines 620-626) + response = mot._meta["oai_chat_response"] + usage = response.get("usage") if isinstance(response, dict) else None + if usage is None: + usage = mot._meta.get("oai_streaming_usage") + + # Populate standardized field (already matches OpenAI format) + if usage: + mot.usage = { + "prompt_tokens": get_value(usage, "prompt_tokens") or 0, + "completion_tokens": get_value(usage, "completion_tokens") or 0, + "total_tokens": get_value(usage, "total_tokens") or 0, + } + ``` + +2. **Ollama** (`mellea/backends/ollama.py:583-642`): + ```python + # Extract from response attributes (lines 620-623) + response = mot._meta.get("ollama_response") + prompt_tokens = getattr(response, "prompt_eval_count", None) if response else None + completion_tokens = getattr(response, "eval_count", None) if response else None + + # Convert to OpenAI-compatible format + if prompt_tokens is not None or completion_tokens is not None: + mot.usage = { + "prompt_tokens": prompt_tokens or 0, + "completion_tokens": completion_tokens or 0, + "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0), + } + ``` + +3. **LiteLLM** (`mellea/backends/litellm.py:425-508`): + ```python + # Extract from full response or streaming usage (lines 483-489) + full_response = mot._meta.get("litellm_full_response") + usage = full_response.get("usage") if isinstance(full_response, dict) else None + if usage is None: + usage = mot._meta.get("litellm_streaming_usage") + + # Populate standardized field (already matches OpenAI format) + if usage: + mot.usage = { + "prompt_tokens": get_value(usage, "prompt_tokens") or 0, + "completion_tokens": get_value(usage, "completion_tokens") or 0, + "total_tokens": get_value(usage, "total_tokens") or 0, + } + ``` + +4. **HuggingFace** (`mellea/backends/huggingface.py:1001-1092`): + ```python + # Calculate from sequences (lines 1063-1076) + hf_output = mot._meta.get("hf_output") + if isinstance(hf_output, GenerateDecoderOnlyOutput): + if input_ids is not None and hf_output.sequences is not None: + try: + n_prompt = input_ids.shape[1] + n_completion = hf_output.sequences[0].shape[0] - n_prompt + # Convert to OpenAI-compatible format + mot.usage = { + "prompt_tokens": n_prompt, + "completion_tokens": n_completion, + "total_tokens": n_prompt + n_completion, + } + except Exception: + pass # Leave as None if calculation fails + ``` + +5. **WatsonX** (`mellea/backends/watsonx.py:450-508`): + ```python + # Extract from response usage (similar to OpenAI) + response = mot._meta.get("watsonx_response") + usage = response.get("usage") if isinstance(response, dict) else None + + # Populate standardized field (already matches OpenAI format) + if usage: + mot.usage = { + "prompt_tokens": get_value(usage, "prompt_tokens") or 0, + "completion_tokens": get_value(usage, "completion_tokens") or 0, + "total_tokens": get_value(usage, "total_tokens") or 0, + } + ``` + +**Files to Modify**: +- `mellea/core/base.py` (add field) +- `mellea/backends/openai.py` (populate field, remove direct metrics call) +- `mellea/backends/ollama.py` (populate field, remove direct metrics call) +- `mellea/backends/litellm.py` (populate field, remove direct metrics call) +- `mellea/backends/huggingface.py` (populate field, remove direct metrics call) +- `mellea/backends/watsonx.py` (populate field, remove direct metrics call) + +### Phase 2: Create Metrics Plugin (Issue #608) + +#### 2.1 Create Token Metrics Plugin + +**New File**: `mellea/plugins/builtin/token_metrics.py` + +**Implementation**: +```python +"""Built-in plugin for recording token usage metrics via OpenTelemetry.""" + +from mellea.plugins import Plugin, hook, HookType, PluginMode +from mellea.plugins.hooks.generation import GenerationPostCallPayload + + +class TokenMetricsPlugin(Plugin, name="token-metrics", priority=100): + """Records token usage metrics from generation_post_call hook. + + This plugin automatically records input/output token counts to OpenTelemetry + metrics when MELLEA_METRICS_ENABLED=true. It reads from the standardized + ModelOutputThunk.usage field populated by backends. + + Execution mode: FIRE_AND_FORGET (async, non-blocking, observe-only) + Priority: 100 (runs after other plugins) + """ + + @hook(HookType.GENERATION_POST_CALL, mode=PluginMode.FIRE_AND_FORGET) + async def record_tokens( + self, + payload: GenerationPostCallPayload, + ctx + ): + """Record token usage metrics from the model output.""" + from mellea.telemetry.metrics import is_metrics_enabled + + # Early return if metrics disabled (zero overhead) + if not is_metrics_enabled(): + return + + mot = payload.model_output + if mot is None or mot.usage is None: + return + + # Extract backend info from context + backend = ctx.global_context.state.get("backend") + if backend is None: + return + + from mellea.telemetry.backend_instrumentation import ( + get_model_id_str, + get_system_name, + ) + from mellea.telemetry.metrics import record_token_usage_metrics + + # Record using standardized usage dict + record_token_usage_metrics( + input_tokens=mot.usage.get("prompt_tokens"), + output_tokens=mot.usage.get("completion_tokens"), + model=get_model_id_str(backend), + backend=backend.__class__.__name__, + system=get_system_name(backend), + ) +``` + +**Key Design Decisions**: +- **FIRE_AND_FORGET mode**: Metrics recording is async, non-blocking, and cannot fail the generation +- **Priority 100**: Runs after other plugins (lower priority = earlier execution) +- **Observe-only**: Reads from `model_output.usage`, doesn't modify payload +- **Zero overhead**: Early return when metrics disabled +- **Backend-agnostic**: Works with any backend that populates `usage` +- **OpenAI-compatible**: Uses standard field name for familiarity and future extensibility + +#### 2.2 Auto-Register Plugin When Metrics Enabled + +**File**: `mellea/telemetry/metrics.py` + +**Changes**: +```python +# At module initialization (after _meter setup, around line 247) +if _OTEL_AVAILABLE and _METRICS_ENABLED: + _meter_provider = _setup_meter_provider() + if _meter_provider is not None: + _meter = metrics.get_meter("mellea.metrics", version("mellea")) + + # Auto-register token metrics plugin + from mellea.plugins.builtin.token_metrics import TokenMetricsPlugin + from mellea.plugins import register + + _token_metrics_plugin = TokenMetricsPlugin() + register(_token_metrics_plugin) +``` + +**Rationale**: +- Automatic activation when `MELLEA_METRICS_ENABLED=true` +- No user code changes required +- Consistent with existing metrics module behavior +- Plugin is globally registered (works with both session and functional API) + +#### 2.3 Update Plugin Module Structure + +**New Directory**: `mellea/plugins/builtin/` + +**Files**: +- `mellea/plugins/builtin/__init__.py` - Export built-in plugins +- `mellea/plugins/builtin/token_metrics.py` - Token metrics plugin + +**Update**: `mellea/plugins/__init__.py` +```python +# Add to exports +from mellea.plugins.builtin import TokenMetricsPlugin + +__all__ = [ + # ... existing exports ... + "TokenMetricsPlugin", +] +``` + +### Phase 3: Update Tests + +#### 3.1 Update Backend Integration Tests + +**Files**: `test/telemetry/test_metrics_backend.py` + +**Changes**: +- Tests should verify `mot.token_usage` is populated (not just metrics recorded) +- Add assertions: `assert mot.token_usage is not None` +- Add assertions: `assert mot.token_usage["prompt_tokens"] > 0` +- Keep existing metrics verification (plugin should still record them) + +**Example**: +```python +async def test_openai_token_metrics(ollama_backend): + """Test that OpenAI backend populates usage and metrics are recorded.""" + # ... existing test setup ... + + # Verify usage field is populated + assert mot.usage is not None + assert mot.usage["prompt_tokens"] > 0 + assert mot.usage["completion_tokens"] > 0 + assert mot.usage["total_tokens"] > 0 + + # Verify metrics were recorded (via plugin) + # ... existing metrics verification ... +``` + +#### 3.2 Add Plugin-Specific Tests + +**New File**: `test/plugins/test_token_metrics_plugin.py` + +**Tests**: +1. Test plugin registers correctly when metrics enabled +2. Test plugin reads from `mot.usage` correctly +3. Test plugin handles missing `usage` gracefully +4. Test plugin is no-op when metrics disabled +5. Test plugin works with all backends +6. Test plugin doesn't block on errors (FIRE_AND_FORGET mode) + +#### 3.3 Update Unit Tests + +**File**: `test/telemetry/test_metrics_token.py` + +**Changes**: +- Tests for `record_token_usage_metrics()` remain unchanged (function still exists) +- Add note that function is now called by plugin, not backends directly + +### Phase 4: Update Documentation + +#### 4.1 Update Telemetry Documentation + +**File**: `docs/dev/telemetry.md` + +**Changes**: +- Update "Token Usage Metrics" section to explain plugin-based architecture +- Add note about `ModelOutputThunk.usage` field (OpenAI-compatible) +- Update backend support table to note standardized field +- Add example of accessing token usage programmatically + +**Example Addition**: +```markdown +### Programmatic Access to Token Usage + +Token usage information is available on the `ModelOutputThunk` after generation: + +```python +with start_session() as m: + result = m.instruct("What is the capital of France?") + + # Access token usage (OpenAI-compatible format) + if result.usage: + print(f"Prompt tokens: {result.usage['prompt_tokens']}") + print(f"Completion tokens: {result.usage['completion_tokens']}") + print(f"Total tokens: {result.usage['total_tokens']}") +``` + +Token metrics are automatically recorded to OpenTelemetry when +`MELLEA_METRICS_ENABLED=true` via the built-in `TokenMetricsPlugin`. +``` + +#### 4.2 Update Plugin Documentation + +**File**: `docs/docs/core-concept/plugins.mdx` + +**Changes**: +- Add section on built-in plugins +- Document `TokenMetricsPlugin` as an example of FIRE_AND_FORGET mode +- Show how built-in plugins auto-register + +#### 4.3 Update Examples + +**File**: `docs/examples/telemetry/metrics_example.py` + +**Changes**: +- Add example showing programmatic access to `usage` +- Add comment explaining plugin-based architecture +- No functional changes (metrics still work the same way) + +**Example Addition**: +```python +# Example 5: Programmatic token access +print("\n5. Accessing token usage programmatically...") +result = m.instruct("Count to five") +if result.usage: + print(f" Prompt tokens: {result.usage['prompt_tokens']}") + print(f" Completion tokens: {result.usage['completion_tokens']}") + print(f" Total tokens: {result.usage['total_tokens']}") +``` + +### Phase 5: Cleanup + +#### 5.1 Remove Backend-Specific Metrics Code + +**All Backend Files**: +- Remove `from ..telemetry.metrics import is_metrics_enabled` imports +- Remove `from ..telemetry.metrics import record_token_usage_metrics` imports +- Remove `from ..telemetry.backend_instrumentation import get_model_id_str, get_system_name` imports (if only used for metrics) +- Remove `if is_metrics_enabled():` blocks and `record_token_usage_metrics()` calls + +**Estimated Lines Removed**: ~15-20 lines per backend × 5 backends = ~75-100 lines + +#### 5.2 Keep Core Metrics Infrastructure + +**Files to Keep Unchanged**: +- `mellea/telemetry/metrics.py` - Core metrics functions still needed by plugin +- `mellea/telemetry/backend_instrumentation.py` - Helper functions still needed + +## Implementation Order + +### Step 1: Add `token_usage` Field (Issue #607) +1. Modify `mellea/core/base.py` to add `token_usage` field +2. Update all 5 backends to populate the field +3. Keep existing `record_token_usage_metrics()` calls temporarily (dual-write) +4. Run tests to verify field is populated correctly + +### Step 2: Create Metrics Plugin (Issue #608) +5. Create `mellea/plugins/builtin/` directory +6. Implement `TokenMetricsPlugin` in `token_metrics.py` +7. Auto-register plugin in `mellea/telemetry/metrics.py` +8. Run tests to verify plugin records metrics correctly + +### Step 3: Remove Duplicate Code +9. Remove direct `record_token_usage_metrics()` calls from all backends +10. Remove unused imports from backends +11. Run full test suite to verify no regressions + +### Step 4: Update Documentation and Tests +12. Update backend integration tests to verify `token_usage` field +13. Add plugin-specific tests +14. Update documentation (telemetry.md, plugins.mdx) +15. Update examples to show programmatic access + +## Testing Strategy + +### Test Coverage Required + +1. **Unit Tests** (existing, should pass unchanged): + - `test/telemetry/test_metrics_token.py` - Tests for `record_token_usage_metrics()` + +2. **Backend Integration Tests** (need updates): + - `test/telemetry/test_metrics_backend.py` - Add `token_usage` field assertions + - All 5 backend tests should verify field population + +3. **Plugin Tests** (new): + - `test/plugins/test_token_metrics_plugin.py` - Plugin-specific tests + - Test plugin registration, execution, error handling + +4. **End-to-End Tests** (existing, should pass): + - `docs/examples/telemetry/metrics_example.py` - Should work unchanged + +### Test Execution + +```bash +# Run metrics tests +uv run pytest test/telemetry/test_metrics*.py -v + +# Run plugin tests +uv run pytest test/plugins/test_token_metrics_plugin.py -v + +# Run backend tests +uv run pytest test/backends/ -k "metrics" -v + +# Run example as test +uv run pytest docs/examples/telemetry/metrics_example.py -v +``` + +## Migration Impact + +### Breaking Changes +**None** - This is a pure refactor with no API changes: +- Metrics still recorded automatically when `MELLEA_METRICS_ENABLED=true` +- Same metric names, attributes, and exporters +- User code unchanged + +### New Features +- **Programmatic access**: Users can now access `mot.token_usage` directly +- **Extensibility**: Users can create custom metrics plugins following this pattern +- **Consistency**: Token data in standard location across all backends + +### Performance Impact +- **Negligible**: Plugin uses FIRE_AND_FORGET mode (async, non-blocking) +- **Same overhead**: Metrics recording logic unchanged, just moved to plugin +- **Zero overhead when disabled**: Early return in plugin when metrics disabled + +## Risks and Mitigations + +### Risk 1: Hook Timing +**Risk**: `generation_post_call` fires after `post_processing()`, but what if backends don't populate `token_usage` in time? + +**Mitigation**: Backends populate `token_usage` **during** `post_processing()`, which completes **before** the hook fires. The hook call is at line 384-398 in `ModelOutputThunk.astream()`, after line 366 (`await self._post_process(self)`). + +### Risk 2: Missing Token Data +**Risk**: Some backends might not have token usage available. + +**Mitigation**: +- `token_usage` field is optional (None when unavailable) +- Plugin checks `if mot.token_usage is None: return` +- Existing behavior preserved (metrics not recorded when data unavailable) + +### Risk 3: Plugin Framework Dependency +**Risk**: Plugin requires `mellea[hooks]` extra dependency. + +**Mitigation**: +- Plugin only imported when metrics enabled +- Graceful fallback if hooks not installed (warning message) +- Most users enabling metrics will have full installation + +### Risk 4: Test Failures +**Risk**: Existing tests might fail if they expect metrics without the plugin. + +**Mitigation**: +- Plugin auto-registers when metrics enabled (same as before) +- Tests run with `MELLEA_METRICS_ENABLED=true` will get plugin automatically +- Dual-write during Step 1 ensures no test breakage during transition + +## Success Criteria + +### Phase 1 Complete When: +- [ ] `ModelOutputThunk.token_usage` field added +- [ ] All 5 backends populate the field correctly +- [ ] Backend integration tests verify field population +- [ ] Existing metrics tests still pass (dual-write active) + +### Phase 2 Complete When: +- [ ] `TokenMetricsPlugin` implemented and tested +- [ ] Plugin auto-registers when metrics enabled +- [ ] Plugin records metrics correctly (verified by existing tests) +- [ ] Plugin-specific tests added and passing + +### Phase 3 Complete When: +- [ ] Direct `record_token_usage_metrics()` calls removed from backends +- [ ] Unused imports cleaned up +- [ ] All tests pass (metrics now recorded via plugin only) + +### Phase 4 Complete When: +- [ ] Documentation updated (telemetry.md, plugins.mdx) +- [ ] Examples updated to show programmatic access +- [ ] All documentation builds without errors + +### Overall Success: +- [ ] All tests pass (`uv run pytest test/`) +- [ ] No breaking changes to user API +- [ ] Metrics still recorded correctly +- [ ] Code is cleaner (no duplication across backends) +- [ ] Token usage accessible programmatically via `mot.token_usage` + +## File Checklist + +### Files to Create +- [ ] `mellea/plugins/builtin/__init__.py` +- [ ] `mellea/plugins/builtin/token_metrics.py` +- [ ] `test/plugins/test_token_metrics_plugin.py` + +### Files to Modify +- [ ] `mellea/core/base.py` - Add `token_usage` field +- [ ] `mellea/backends/openai.py` - Populate field, remove metrics call +- [ ] `mellea/backends/ollama.py` - Populate field, remove metrics call +- [ ] `mellea/backends/litellm.py` - Populate field, remove metrics call +- [ ] `mellea/backends/huggingface.py` - Populate field, remove metrics call +- [ ] `mellea/backends/watsonx.py` - Populate field, remove metrics call +- [ ] `mellea/telemetry/metrics.py` - Auto-register plugin +- [ ] `mellea/plugins/__init__.py` - Export TokenMetricsPlugin +- [ ] `test/telemetry/test_metrics_backend.py` - Add token_usage assertions +- [ ] `docs/dev/telemetry.md` - Document new architecture +- [ ] `docs/docs/core-concept/plugins.mdx` - Document built-in plugin +- [ ] `docs/examples/telemetry/metrics_example.py` - Add programmatic access example + +### Files to Review (No Changes Expected) +- `mellea/telemetry/backend_instrumentation.py` - Helper functions still used +- `test/telemetry/test_metrics_token.py` - Unit tests for core function +- `mellea/plugins/hooks/generation.py` - Payload definition unchanged + +## Estimated Effort + +- **Phase 1** (Standardize storage): 2-3 hours + - Add field: 15 min + - Update 5 backends: 1.5 hours (30 min each) + - Update tests: 1 hour + +- **Phase 2** (Create plugin): 1-2 hours + - Implement plugin: 45 min + - Auto-registration: 15 min + - Plugin tests: 1 hour + +- **Phase 3** (Cleanup): 30 min + - Remove duplicate code: 20 min + - Verify tests: 10 min + +- **Phase 4** (Documentation): 1 hour + - Update docs: 30 min + - Update examples: 30 min + +**Total**: 4.5-6.5 hours + +## Open Questions + +1. **Should `token_usage` be added to `GenerationPostCallPayload`?** + - Pro: Makes it explicit in the hook payload + - Con: Redundant (already on `model_output`) + - **Recommendation**: No, keep it on MOT only (single source of truth) + +2. **Should the plugin be optional or always registered?** + - Current plan: Auto-register when `MELLEA_METRICS_ENABLED=true` + - Alternative: Always register, let plugin check `is_metrics_enabled()` internally + - **Recommendation**: Auto-register (cleaner, no overhead when disabled) + +3. **Should we support custom metrics plugins?** + - Users could create their own plugins reading from `token_usage` + - **Recommendation**: Yes, document this pattern in plugins.mdx + +4. **What about vLLM backend?** + - Not modified in commit 0e71558 (no metrics support) + - **Recommendation**: Out of scope for this refactor, can be added later + +## References + +- Issue #607: https://github.com/generative-computing/mellea/issues/607 +- Issue #608: https://github.com/generative-computing/mellea/issues/608 +- PR #563 (metrics): https://github.com/generative-computing/mellea/pull/563 +- PR #582 (hooks): https://github.com/generative-computing/mellea/pull/582 +- Commit 0e71558: Metrics implementation +- Commit cbd63bd: Hooks/plugins system +- Hook system spec: `docs/dev/hook_system.md` +- Plugin examples: `docs/examples/plugins/` \ No newline at end of file diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index 83655f3e..4efa5cbb 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -1083,21 +1083,12 @@ async def post_processing( "total_tokens": n_prompt + n_completion, } - # Record metrics if enabled - if metrics_enabled and n_prompt is not None: - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - - record_token_usage_metrics( - input_tokens=n_prompt, - output_tokens=n_completion, - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and provider metadata + if hasattr(self.model_id, "hf_model_name"): + mot.model = str(self.model_id.hf_model_name) # type: ignore + else: + mot.model = str(self.model_id) + mot.provider = "huggingface" # Record tracing if span exists if span is not None: diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index ef453d29..43e8c262 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -492,24 +492,9 @@ async def post_processing( if usage: mot.usage = usage - # Record metrics if enabled - from ..telemetry.metrics import is_metrics_enabled - - if is_metrics_enabled() and usage: - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - from .utils import get_value - - record_token_usage_metrics( - input_tokens=get_value(usage, "prompt_tokens"), - output_tokens=get_value(usage, "completion_tokens"), - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and provider metadata + mot.model = str(self.model_id) + mot.provider = "litellm" # Record telemetry now that response is available span = mot._meta.get("_telemetry_span") diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index 276bb180..8cae59fa 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -630,23 +630,9 @@ async def post_processing( "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0), } - # Record metrics if enabled - from ..telemetry.metrics import is_metrics_enabled - - if is_metrics_enabled(): - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - - record_token_usage_metrics( - input_tokens=prompt_tokens, - output_tokens=completion_tokens, - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and provider metadata + mot.model = str(self.model_id) + mot.provider = "ollama" # Record telemetry and close span now that response is available span = mot._meta.get("_telemetry_span") diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index 16a93485..561fd58b 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -629,24 +629,9 @@ async def post_processing( if usage: mot.usage = usage - # Record metrics if enabled - from ..telemetry.metrics import is_metrics_enabled - - if is_metrics_enabled() and usage: - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - from .utils import get_value - - record_token_usage_metrics( - input_tokens=get_value(usage, "prompt_tokens"), - output_tokens=get_value(usage, "completion_tokens"), - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and provider metadata + mot.model = str(self.model_id) + mot.provider = "openai" # Record telemetry now that response is available span = mot._meta.get("_telemetry_span") diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index 5a4bf24a..f6f65283 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -500,24 +500,9 @@ async def post_processing( if usage: mot.usage = usage - # Record metrics if enabled - from ..telemetry.metrics import is_metrics_enabled - - if is_metrics_enabled() and usage: - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - from .utils import get_value - - record_token_usage_metrics( - input_tokens=get_value(usage, "prompt_tokens"), - output_tokens=get_value(usage, "completion_tokens"), - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and provider metadata + mot.model = str(self.model_id) + mot.provider = "watsonx" # Record tracing if span exists span = mot._meta.get("_telemetry_span") diff --git a/mellea/core/base.py b/mellea/core/base.py index 28b58234..4850fc9d 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -206,6 +206,20 @@ def __init__( and 'prompt_tokens_details' for advanced features (reasoning, audio, caching). """ + self.model: str | None = None + """Model identifier that generated this output. + + Examples: 'gpt-4', 'llama2:7b', 'meta-llama/Llama-2-7b-hf'. + Populated by backends. None if unavailable. + """ + + self.provider: str | None = None + """Provider that generated this output. + + Examples: 'openai', 'ollama', 'huggingface', 'watsonx'. + Populated by backends. None if unavailable. + """ + # Used for tracking generation. self._context: list[Component | CBlock] | None = None self._action: Component | CBlock | None = None @@ -243,6 +257,8 @@ def _copy_from(self, other: ModelOutputThunk) -> None: self.tool_calls = other.tool_calls self._thinking = other._thinking self.usage = other.usage + self.model = other.model + self.provider = other.provider self._generate_log = other._generate_log def is_computed(self) -> bool: @@ -443,6 +459,9 @@ def __copy__(self) -> ModelOutputThunk: copied._context = self._context copied._generate_log = self._generate_log copied._model_options = self._model_options + copied.usage = self.usage + copied.model = self.model + copied.provider = self.provider return copied def __deepcopy__(self, memo: dict) -> ModelOutputThunk: @@ -472,6 +491,9 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk: ) # The items in a context should be immutable. deepcopied._generate_log = copy(self._generate_log) deepcopied._model_options = copy(self._model_options) + deepcopied.usage = deepcopy(self.usage) if self.usage else None + deepcopied.model = self.model + deepcopied.provider = self.provider return deepcopied diff --git a/mellea/telemetry/metrics.py b/mellea/telemetry/metrics.py index e4b99294..9382c10b 100644 --- a/mellea/telemetry/metrics.py +++ b/mellea/telemetry/metrics.py @@ -407,13 +407,9 @@ def _get_token_counters() -> tuple[Any, Any]: def record_token_usage_metrics( - input_tokens: int | None, - output_tokens: int | None, - model: str, - backend: str, - system: str, + input_tokens: int | None, output_tokens: int | None, model: str, provider: str ) -> None: - """Record token usage metrics following Gen-AI semantic conventions. + """Record token usage metrics following OpenTelemetry Gen-AI semantic conventions. This is a no-op when metrics are disabled, ensuring zero overhead. @@ -421,16 +417,14 @@ def record_token_usage_metrics( input_tokens: Number of input tokens (prompt tokens), or None if unavailable output_tokens: Number of output tokens (completion tokens), or None if unavailable model: Model identifier (e.g., "gpt-4", "llama2:7b") - backend: Backend class name (e.g., "OpenAIBackend", "OllamaBackend") - system: Gen-AI system name (e.g., "openai", "ollama", "watsonx") + provider: Provider name (e.g., "openai", "ollama", "watsonx") Example: record_token_usage_metrics( input_tokens=150, output_tokens=50, model="llama2:7b", - backend="OllamaBackend", - system="ollama" + provider="ollama" ) """ # Early return if metrics are disabled (zero overhead) @@ -440,12 +434,8 @@ def record_token_usage_metrics( # Get the token counters (lazily initialized) input_counter, output_counter = _get_token_counters() - # Prepare attributes following Gen-AI semantic conventions - attributes = { - "gen_ai.system": system, - "gen_ai.request.model": model, - "mellea.backend": backend, - } + # Prepare attributes following OTel Gen-AI semantic conventions + attributes = {"gen_ai.provider.name": provider, "gen_ai.request.model": model} # Record input tokens if available if input_tokens is not None and input_tokens > 0: @@ -456,6 +446,30 @@ def record_token_usage_metrics( output_counter.add(output_tokens, attributes) +# Auto-register TokenMetricsPlugin when metrics are enabled +if _OTEL_AVAILABLE and _METRICS_ENABLED: + try: + from mellea.plugins.registry import register + from mellea.telemetry.metrics_plugins import TokenMetricsPlugin + + # Idempotent registration (supports module reloads in tests) + try: + register(TokenMetricsPlugin()) + except ValueError as e: + # Already registered (expected during module reloads in tests) + warnings.warn( + f"TokenMetricsPlugin already registered: {e}", UserWarning, stacklevel=2 + ) + except ImportError: + warnings.warn( + "Metrics are enabled but the plugin framework is not installed. " + "Token usage metrics will not be recorded automatically. " + "Install with: pip install mellea[telemetry]", + UserWarning, + stacklevel=2, + ) + + __all__ = [ "create_counter", "create_histogram", diff --git a/mellea/telemetry/metrics_plugins.py b/mellea/telemetry/metrics_plugins.py new file mode 100644 index 00000000..cbf45ac8 --- /dev/null +++ b/mellea/telemetry/metrics_plugins.py @@ -0,0 +1,62 @@ +"""Metrics plugins for recording telemetry data via hooks. + +This module contains plugins that hook into the generation pipeline to +automatically record metrics when enabled. Currently includes: + +- TokenMetricsPlugin: Records token usage statistics from ModelOutputThunk.usage +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from mellea.plugins.base import Plugin +from mellea.plugins.decorators import hook +from mellea.plugins.types import PluginMode + +if TYPE_CHECKING: + from mellea.plugins.hooks.generation import GenerationPostCallPayload + + +class TokenMetricsPlugin(Plugin, name="token_metrics", priority=50): + """Records token usage metrics from generation outputs. + + This plugin hooks into the generation_post_call event to automatically + record token usage metrics when the usage field is populated on + ModelOutputThunk instances. + + The plugin reads the standardized usage field (OpenAI-compatible format) + and records metrics following OpenTelemetry Gen-AI semantic conventions. + + Example: + >>> from mellea.telemetry.metrics_plugins import TokenMetricsPlugin + >>> from mellea.telemetry.metrics import enable_metrics + >>> + >>> enable_metrics() + >>> with TokenMetricsPlugin(): + ... result = session.instruct("Hello, world!") + """ + + @hook("generation_post_call", mode=PluginMode.SEQUENTIAL) + async def record_token_metrics( + self, payload: GenerationPostCallPayload, context: dict[str, Any] + ) -> None: + """Record token metrics after generation completes. + + Args: + payload: Contains the model_output (ModelOutputThunk) with usage data + context: Plugin context (unused) + """ + from mellea.telemetry.metrics import record_token_usage_metrics + + mot = payload.model_output + if mot.usage is None: + return + + # Record metrics (no-op if metrics disabled) + record_token_usage_metrics( + input_tokens=mot.usage.get("prompt_tokens"), + output_tokens=mot.usage.get("completion_tokens"), + model=mot.model or "unknown", + provider=mot.provider or "unknown", + ) diff --git a/pyproject.toml b/pyproject.toml index d5d7bb24..fbde0025 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,7 @@ telemetry = [ "opentelemetry-exporter-otlp>=1.20.0", "opentelemetry-exporter-prometheus>=0.40b0", "opentelemetry-distro>=0.59b0", + "mellea[hooks]", ] docling = [ diff --git a/test/telemetry/test_metrics.py b/test/telemetry/test_metrics.py index 81143800..f07ea61e 100644 --- a/test/telemetry/test_metrics.py +++ b/test/telemetry/test_metrics.py @@ -569,11 +569,7 @@ def test_token_counters_lazy_initialization(enable_metrics): from mellea.telemetry.metrics import record_token_usage_metrics record_token_usage_metrics( - input_tokens=100, - output_tokens=50, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama" ) # Now should be initialized @@ -589,11 +585,7 @@ def test_record_token_usage_metrics_with_valid_tokens(enable_metrics): # Should not raise record_token_usage_metrics( - input_tokens=150, - output_tokens=50, - model="gpt-4", - backend="OpenAIBackend", - system="openai", + input_tokens=150, output_tokens=50, model="gpt-4", provider="openai" ) @@ -603,11 +595,7 @@ def test_record_token_usage_metrics_with_none_tokens(enable_metrics): # Should not raise record_token_usage_metrics( - input_tokens=None, - output_tokens=None, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=None, output_tokens=None, model="llama2:7b", provider="ollama" ) @@ -617,11 +605,7 @@ def test_record_token_usage_metrics_with_zero_tokens(enable_metrics): # Should not raise, but won't record zeros record_token_usage_metrics( - input_tokens=0, - output_tokens=0, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=0, output_tokens=0, model="llama2:7b", provider="ollama" ) @@ -631,11 +615,7 @@ def test_record_token_usage_metrics_noop_when_disabled(clean_metrics_env): # Should not raise and should be no-op record_token_usage_metrics( - input_tokens=100, - output_tokens=50, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama" ) # Counters should still be None (not initialized) diff --git a/test/telemetry/test_metrics_backend.py b/test/telemetry/test_metrics_backend.py index 5b2702bd..8918564e 100644 --- a/test/telemetry/test_metrics_backend.py +++ b/test/telemetry/test_metrics_backend.py @@ -142,12 +142,12 @@ async def test_ollama_token_metrics_integration(enable_metrics, metric_reader, s # Verify input token counter input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "ollama"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "ollama"} ) # Verify output token counter output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "ollama"} + metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "ollama"} ) # Ollama should always return token counts @@ -198,11 +198,11 @@ async def test_openai_token_metrics_integration(enable_metrics, metric_reader, s # OpenAI always provides token counts input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "openai"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "openai"} ) output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "openai"} + metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "openai"} ) assert input_tokens is not None, "Input tokens should be recorded" @@ -246,11 +246,11 @@ async def test_watsonx_token_metrics_integration(enable_metrics, metric_reader): metrics_data = metric_reader.get_metrics_data() input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "watsonx"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "watsonx"} ) output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "watsonx"} + metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "watsonx"} ) assert input_tokens is not None, "Input tokens should be recorded" @@ -306,11 +306,11 @@ async def test_litellm_token_metrics_integration( metrics_data = metric_reader.get_metrics_data() input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "litellm"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "litellm"} ) output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "litellm"} + metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "litellm"} ) # LiteLLM with Ollama backend should always provide token counts @@ -357,11 +357,13 @@ async def test_huggingface_token_metrics_integration( # HuggingFace computes token counts locally input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "huggingface"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "huggingface"} ) output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "huggingface"} + metrics_data, + "mellea.llm.tokens.output", + {"gen_ai.provider.name": "huggingface"}, ) assert input_tokens is not None, "Input tokens should be recorded" diff --git a/test/telemetry/test_metrics_token.py b/test/telemetry/test_metrics_token.py index a28a9460..2a61acce 100644 --- a/test/telemetry/test_metrics_token.py +++ b/test/telemetry/test_metrics_token.py @@ -58,11 +58,7 @@ def test_record_token_metrics_basic(clean_metrics_env): # Record some token usage record_token_usage_metrics( - input_tokens=150, - output_tokens=50, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=150, output_tokens=50, model="llama2:7b", provider="ollama" ) # Force metrics collection @@ -86,9 +82,8 @@ def test_record_token_metrics_basic(clean_metrics_env): # Verify attributes for data_point in metric.data.data_points: attrs = dict(data_point.attributes) - assert attrs["gen_ai.system"] == "ollama" + assert attrs["gen_ai.provider.name"] == "ollama" assert attrs["gen_ai.request.model"] == "llama2:7b" - assert attrs["mellea.backend"] == "OllamaBackend" assert data_point.value == 150 if metric.name == "mellea.llm.tokens.output": @@ -96,9 +91,8 @@ def test_record_token_metrics_basic(clean_metrics_env): # Verify attributes for data_point in metric.data.data_points: attrs = dict(data_point.attributes) - assert attrs["gen_ai.system"] == "ollama" + assert attrs["gen_ai.provider.name"] == "ollama" assert attrs["gen_ai.request.model"] == "llama2:7b" - assert attrs["mellea.backend"] == "OllamaBackend" assert data_point.value == 50 assert found_input, "Input token metric not found" @@ -120,18 +114,10 @@ def test_record_token_metrics_accumulation(clean_metrics_env): # Record multiple token usages with same attributes record_token_usage_metrics( - input_tokens=100, - output_tokens=30, - model="gpt-4", - backend="OpenAIBackend", - system="openai", + input_tokens=100, output_tokens=30, model="gpt-4", provider="openai" ) record_token_usage_metrics( - input_tokens=200, - output_tokens=70, - model="gpt-4", - backend="OpenAIBackend", - system="openai", + input_tokens=200, output_tokens=70, model="gpt-4", provider="openai" ) # Force metrics collection @@ -166,11 +152,7 @@ def test_record_token_metrics_none_handling(clean_metrics_env): # Record with None values (should not crash) record_token_usage_metrics( - input_tokens=None, - output_tokens=None, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=None, output_tokens=None, model="llama2:7b", provider="ollama" ) # Should not raise, and no metrics should be recorded for None values @@ -203,25 +185,13 @@ def test_record_token_metrics_multiple_backends(clean_metrics_env): # Record from different backends record_token_usage_metrics( - input_tokens=100, - output_tokens=50, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama" ) record_token_usage_metrics( - input_tokens=200, - output_tokens=80, - model="gpt-4", - backend="OpenAIBackend", - system="openai", + input_tokens=200, output_tokens=80, model="gpt-4", provider="openai" ) record_token_usage_metrics( - input_tokens=150, - output_tokens=60, - model="granite-3-8b", - backend="WatsonxBackend", - system="watsonx", + input_tokens=150, output_tokens=60, model="granite-3-8b", provider="watsonx" ) # Force metrics collection @@ -240,18 +210,16 @@ def test_record_token_metrics_multiple_backends(clean_metrics_env): for dp in metric.data.data_points: attrs = dict(dp.attributes) key = ( - attrs["gen_ai.system"], + attrs["gen_ai.provider.name"], attrs["gen_ai.request.model"], - attrs["mellea.backend"], ) input_attrs.add(key) if metric.name == "mellea.llm.tokens.output": for dp in metric.data.data_points: attrs = dict(dp.attributes) key = ( - attrs["gen_ai.system"], + attrs["gen_ai.provider.name"], attrs["gen_ai.request.model"], - attrs["mellea.backend"], ) output_attrs.add(key) diff --git a/uv.lock b/uv.lock index e64504fb..f64846ff 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.11" resolution-markers = [ "python_full_version >= '3.14' and python_full_version < '4'", @@ -3747,6 +3747,8 @@ server = [ { name = "uvicorn" }, ] telemetry = [ + { name = "cpex" }, + { name = "grpcio" }, { name = "opentelemetry-api" }, { name = "opentelemetry-distro" }, { name = "opentelemetry-exporter-otlp" }, @@ -3854,6 +3856,7 @@ requires-dist = [ { name = "llm-sandbox", extras = ["docker"], marker = "extra == 'sandbox'", specifier = ">=0.3.23" }, { name = "math-verify" }, { name = "mellea", extras = ["backends", "docling", "tools", "telemetry", "server", "sandbox", "granite-retriever", "hooks"], marker = "extra == 'all'" }, + { name = "mellea", extras = ["hooks"], marker = "extra == 'telemetry'" }, { name = "mellea", extras = ["watsonx", "hf", "vllm", "litellm"], marker = "extra == 'backends'" }, { name = "mistletoe", specifier = ">=1.4.0" }, { name = "numpy", marker = "extra == 'vllm'", specifier = "<=2.2" }, @@ -5813,11 +5816,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/75/2e/a9e28941c6dab6f06e6d3f6783d3373044be9b0f9a9d3492c3d8d2260ac0/pybase64-1.4.3-cp312-cp312-win32.whl", hash = "sha256:7bca1ed3a5df53305c629ca94276966272eda33c0d71f862d2d3d043f1e1b91a", size = 33686, upload-time = "2025-12-06T13:23:37.848Z" }, { url = "https://files.pythonhosted.org/packages/83/e3/507ab649d8c3512c258819c51d25c45d6e29d9ca33992593059e7b646a33/pybase64-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:9f2da8f56d9b891b18b4daf463a0640eae45a80af548ce435be86aa6eff3603b", size = 35833, upload-time = "2025-12-06T13:23:38.877Z" }, { url = "https://files.pythonhosted.org/packages/bc/8a/6eba66cd549a2fc74bb4425fd61b839ba0ab3022d3c401b8a8dc2cc00c7a/pybase64-1.4.3-cp312-cp312-win_arm64.whl", hash = "sha256:0631d8a2d035de03aa9bded029b9513e1fee8ed80b7ddef6b8e9389ffc445da0", size = 31185, upload-time = "2025-12-06T13:23:39.908Z" }, - { url = "https://files.pythonhosted.org/packages/3a/50/b7170cb2c631944388fe2519507fe3835a4054a6a12a43f43781dae82be1/pybase64-1.4.3-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:ea4b785b0607d11950b66ce7c328f452614aefc9c6d3c9c28bae795dc7f072e1", size = 33901, upload-time = "2025-12-06T13:23:40.951Z" }, { url = "https://files.pythonhosted.org/packages/48/8b/69f50578e49c25e0a26e3ee72c39884ff56363344b79fc3967f5af420ed6/pybase64-1.4.3-cp313-cp313-android_21_x86_64.whl", hash = "sha256:6a10b6330188c3026a8b9c10e6b9b3f2e445779cf16a4c453d51a072241c65a2", size = 40807, upload-time = "2025-12-06T13:23:42.006Z" }, - { url = "https://files.pythonhosted.org/packages/5c/8d/20b68f11adfc4c22230e034b65c71392e3e338b413bf713c8945bd2ccfb3/pybase64-1.4.3-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:27fdff227a0c0e182e0ba37a99109645188978b920dfb20d8b9c17eeee370d0d", size = 30932, upload-time = "2025-12-06T13:23:43.348Z" }, - { url = "https://files.pythonhosted.org/packages/f7/79/b1b550ac6bff51a4880bf6e089008b2e1ca16f2c98db5e039a08ac3ad157/pybase64-1.4.3-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2a8204f1fdfec5aa4184249b51296c0de95445869920c88123978304aad42df1", size = 31394, upload-time = "2025-12-06T13:23:44.317Z" }, - { url = "https://files.pythonhosted.org/packages/82/70/b5d7c5932bf64ee1ec5da859fbac981930b6a55d432a603986c7f509c838/pybase64-1.4.3-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:874fc2a3777de6baf6aa921a7aa73b3be98295794bea31bd80568a963be30767", size = 38078, upload-time = "2025-12-06T13:23:45.348Z" }, { url = "https://files.pythonhosted.org/packages/1c/c9/24b3b905cf75e23a9a4deaf203b35ffcb9f473ac0e6d8257f91a05dfce62/pybase64-1.4.3-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:1d45c8fe8fe82b65c36b227bb4a2cf623d9ada16bed602ce2d3e18c35285b72a", size = 68244, upload-time = "2025-12-06T13:23:49.026Z" }, { url = "https://files.pythonhosted.org/packages/f8/cd/d15b0c3e25e5859fab0416dc5b96d34d6bd2603c1c96a07bb2202b68ab92/pybase64-1.4.3-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ad70c26ba091d8f5167e9d4e1e86a0483a5414805cdb598a813db635bd3be8b8", size = 71620, upload-time = "2025-12-06T13:23:50.081Z" }, { url = "https://files.pythonhosted.org/packages/0d/31/4ca953cc3dcde2b3711d6bfd70a6f4ad2ca95a483c9698076ba605f1520f/pybase64-1.4.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e98310b7c43145221e7194ac9fa7fffc84763c87bfc5e2f59f9f92363475bdc1", size = 59930, upload-time = "2025-12-06T13:23:51.68Z" }, @@ -5852,11 +5851,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/10/abb7757c330bb869ebb95dab0c57edf5961ffbd6c095c8209cbbf75d117d/pybase64-1.4.3-cp313-cp313t-win32.whl", hash = "sha256:46d75c9387f354c5172582a9eaae153b53a53afeb9c19fcf764ea7038be3bd8b", size = 33965, upload-time = "2025-12-06T13:24:28.548Z" }, { url = "https://files.pythonhosted.org/packages/63/a0/2d4e5a59188e9e6aed0903d580541aaea72dcbbab7bf50fb8b83b490b6c3/pybase64-1.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:d7344625591d281bec54e85cbfdab9e970f6219cac1570f2aa140b8c942ccb81", size = 36207, upload-time = "2025-12-06T13:24:29.646Z" }, { url = "https://files.pythonhosted.org/packages/1f/05/95b902e8f567b4d4b41df768ccc438af618f8d111e54deaf57d2df46bd76/pybase64-1.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:28a3c60c55138e0028313f2eccd321fec3c4a0be75e57a8d3eb883730b1b0880", size = 31505, upload-time = "2025-12-06T13:24:30.687Z" }, - { url = "https://files.pythonhosted.org/packages/e4/80/4bd3dff423e5a91f667ca41982dc0b79495b90ec0c0f5d59aca513e50f8c/pybase64-1.4.3-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:015bb586a1ea1467f69d57427abe587469392215f59db14f1f5c39b52fdafaf5", size = 33835, upload-time = "2025-12-06T13:24:31.767Z" }, { url = "https://files.pythonhosted.org/packages/45/60/a94d94cc1e3057f602e0b483c9ebdaef40911d84a232647a2fe593ab77bb/pybase64-1.4.3-cp314-cp314-android_24_x86_64.whl", hash = "sha256:d101e3a516f837c3dcc0e5a0b7db09582ebf99ed670865223123fb2e5839c6c0", size = 40673, upload-time = "2025-12-06T13:24:32.82Z" }, - { url = "https://files.pythonhosted.org/packages/e3/71/cf62b261d431857e8e054537a5c3c24caafa331de30daede7b2c6c558501/pybase64-1.4.3-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:8f183ac925a48046abe047360fe3a1b28327afb35309892132fe1915d62fb282", size = 30939, upload-time = "2025-12-06T13:24:34.001Z" }, - { url = "https://files.pythonhosted.org/packages/24/3e/d12f92a3c1f7c6ab5d53c155bff9f1084ba997a37a39a4f781ccba9455f3/pybase64-1.4.3-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30bf3558e24dcce4da5248dcf6d73792adfcf4f504246967e9db155be4c439ad", size = 31401, upload-time = "2025-12-06T13:24:35.11Z" }, - { url = "https://files.pythonhosted.org/packages/9b/3d/9c27440031fea0d05146f8b70a460feb95d8b4e3d9ca8f45c972efb4c3d3/pybase64-1.4.3-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a674b419de318d2ce54387dd62646731efa32b4b590907800f0bd40675c1771d", size = 38075, upload-time = "2025-12-06T13:24:36.53Z" }, { url = "https://files.pythonhosted.org/packages/db/26/b136a4b65e5c94ff06217f7726478df3f31ab1c777c2c02cf698e748183f/pybase64-1.4.3-cp314-cp314-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:b51204d349a4b208287a8aa5b5422be3baa88abf6cc8ff97ccbda34919bbc857", size = 68460, upload-time = "2025-12-06T13:24:41.735Z" }, { url = "https://files.pythonhosted.org/packages/68/6d/84ce50e7ee1ae79984d689e05a9937b2460d4efa1e5b202b46762fb9036c/pybase64-1.4.3-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:30f2fd53efecbdde4bdca73a872a68dcb0d1bf8a4560c70a3e7746df973e1ef3", size = 71688, upload-time = "2025-12-06T13:24:42.908Z" }, { url = "https://files.pythonhosted.org/packages/e3/57/6743e420416c3ff1b004041c85eb0ebd9c50e9cf05624664bfa1dc8b5625/pybase64-1.4.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0932b0c5cfa617091fd74f17d24549ce5de3628791998c94ba57be808078eeaf", size = 60040, upload-time = "2025-12-06T13:24:44.37Z" }, From d35d37a7ed2aec987ce5197caa110456cb956b09 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Fri, 13 Mar 2026 19:14:16 -0500 Subject: [PATCH 3/4] test: add telemetry field assertions to backend tests Add assertions to verify usage, model, and provider fields are populated in test_async_avalue tests for all 5 supported backends. Signed-off-by: Alex Bozarth --- docs/dev/metrics_refactor_plan.md | 673 --------------------------- test/backends/test_huggingface.py | 8 + test/backends/test_litellm_ollama.py | 8 + test/backends/test_ollama.py | 8 + test/backends/test_openai_ollama.py | 8 + test/backends/test_watsonx.py | 8 + 6 files changed, 40 insertions(+), 673 deletions(-) delete mode 100644 docs/dev/metrics_refactor_plan.md diff --git a/docs/dev/metrics_refactor_plan.md b/docs/dev/metrics_refactor_plan.md deleted file mode 100644 index 4a82ce7b..00000000 --- a/docs/dev/metrics_refactor_plan.md +++ /dev/null @@ -1,673 +0,0 @@ -# Metrics Refactor Plan: Migrate to Hooks/Plugins System - -## Overview - -This plan addresses issues [#607](https://github.com/generative-computing/mellea/issues/607) and [#608](https://github.com/generative-computing/mellea/issues/608) to refactor the token metrics implementation added in commit `0e71558` to use the hooks/plugins system introduced in commit `cbd63bd`. - -**Current State**: Token metrics are recorded via direct `record_token_usage_metrics()` calls in each backend's `post_processing()` method (5 backends: OpenAI, Ollama, LiteLLM, HuggingFace, WatsonX). - -**Target State**: Token metrics recorded via a plugin that hooks into `generation_post_call`, with token usage data standardized on `ModelOutputThunk` using a `usage` field that matches the OpenAI API standard. - -## Key Commits Analysis - -### Commit 0e71558 (Metrics Implementation) -- Added `mellea/telemetry/metrics.py` with `record_token_usage_metrics()` -- Modified 5 backends to call `record_token_usage_metrics()` in their `post_processing()` methods -- Token extraction logic varies per backend: - - **OpenAI/LiteLLM/WatsonX**: Extract from `usage` dict via `get_value(usage, "prompt_tokens")` - - **Ollama**: Extract from response attributes (`prompt_eval_count`, `eval_count`) - - **HuggingFace**: Calculate from `input_ids` and output sequences -- Each backend stores usage info in different `_meta` locations - -### Commit cbd63bd (Hooks/Plugins System) -- Added complete hooks/plugins infrastructure in `mellea/plugins/` -- Implemented `generation_post_call` hook that fires after `post_process()` completes -- Hook fires in `ModelOutputThunk.astream()` at line 384-398 in `mellea/core/base.py` -- Payload: `GenerationPostCallPayload(prompt, model_output, latency_ms)` -- Policy: `generation_post_call` is **observe-only** (no writable fields) - -## Problem Statement - -### Issue #607: Standardize Token Storage -**Problem**: Token usage is stored inconsistently across backends in various `_meta` locations, making programmatic access difficult. - -**Solution**: Add a standard `usage` field to `ModelOutputThunk` that all backends populate, matching the OpenAI API standard. - -### Issue #608: Use Hooks for Metrics -**Problem**: Metrics recording is duplicated across 5 backends, tightly coupled to backend implementation. - -**Solution**: Create a metrics plugin that hooks `generation_post_call` to record token usage from the standardized field. - -## Architecture Decision - -The refactor follows this sequence: - -1. **First** (Issue #607): Standardize token storage on `ModelOutputThunk` -2. **Then** (Issue #608): Create metrics plugin to consume standardized data - -This ordering is critical because: -- The plugin needs a consistent data source to read from -- Backends must populate the standard field before the hook fires -- The hook fires **after** `post_processing()` completes, so backends have already extracted tokens - -## Detailed Implementation Plan - -### Phase 1: Standardize Token Storage (Issue #607) - -#### 1.1 Add `usage` Field to ModelOutputThunk - -**File**: `mellea/core/base.py` - -**Changes**: -```python -class ModelOutputThunk(CBlock, Generic[S]): - def __init__( - self, - value: str | None, - meta: dict[str, Any] | None = None, - parsed_repr: S | None = None, - tool_calls: dict[str, ModelToolCall] | None = None, - ): - # ... existing code ... - - # Add new field for standardized usage information - self.usage: dict[str, int] | None = None - """Usage information following OpenAI API standard. - - Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'. - Populated by backends during post_processing. None if unavailable. - - Future: May include optional breakdown fields like 'completion_tokens_details' - and 'prompt_tokens_details' for advanced features (reasoning, audio, caching). - """ -``` - -**Rationale**: -- Matches OpenAI API standard (industry convention) -- Consistent with existing `tool_calls` field pattern -- Extensible for future usage fields beyond tokens (cost, reasoning tokens, cached tokens) -- Simple dict format compatible with all backends -- Optional (None) when backend doesn't provide usage info -- Accessible via `model_output.usage` in hooks and user code - -#### 1.2 Update Backend `post_processing()` Methods - -Each backend's `post_processing()` method must populate `mot.token_usage` **before** the method returns (since `generation_post_call` hook fires after `post_processing` completes). - -**Pattern for all backends**: -```python -async def post_processing(self, mot: ModelOutputThunk, ...): - # ... existing processing logic ... - - # Extract token usage (backend-specific logic) - prompt_tokens = - completion_tokens = - - # Populate standardized field (matches OpenAI API format) - if prompt_tokens is not None or completion_tokens is not None: - mot.usage = { - "prompt_tokens": prompt_tokens or 0, - "completion_tokens": completion_tokens or 0, - "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0), - } - - # REMOVE: Direct metrics recording - # from ..telemetry.metrics import record_token_usage_metrics - # record_token_usage_metrics(...) -``` - -**Backend-Specific Extraction Logic**: - -1. **OpenAI** (`mellea/backends/openai.py:562-646`): - ```python - # Extract from response or streaming usage (lines 620-626) - response = mot._meta["oai_chat_response"] - usage = response.get("usage") if isinstance(response, dict) else None - if usage is None: - usage = mot._meta.get("oai_streaming_usage") - - # Populate standardized field (already matches OpenAI format) - if usage: - mot.usage = { - "prompt_tokens": get_value(usage, "prompt_tokens") or 0, - "completion_tokens": get_value(usage, "completion_tokens") or 0, - "total_tokens": get_value(usage, "total_tokens") or 0, - } - ``` - -2. **Ollama** (`mellea/backends/ollama.py:583-642`): - ```python - # Extract from response attributes (lines 620-623) - response = mot._meta.get("ollama_response") - prompt_tokens = getattr(response, "prompt_eval_count", None) if response else None - completion_tokens = getattr(response, "eval_count", None) if response else None - - # Convert to OpenAI-compatible format - if prompt_tokens is not None or completion_tokens is not None: - mot.usage = { - "prompt_tokens": prompt_tokens or 0, - "completion_tokens": completion_tokens or 0, - "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0), - } - ``` - -3. **LiteLLM** (`mellea/backends/litellm.py:425-508`): - ```python - # Extract from full response or streaming usage (lines 483-489) - full_response = mot._meta.get("litellm_full_response") - usage = full_response.get("usage") if isinstance(full_response, dict) else None - if usage is None: - usage = mot._meta.get("litellm_streaming_usage") - - # Populate standardized field (already matches OpenAI format) - if usage: - mot.usage = { - "prompt_tokens": get_value(usage, "prompt_tokens") or 0, - "completion_tokens": get_value(usage, "completion_tokens") or 0, - "total_tokens": get_value(usage, "total_tokens") or 0, - } - ``` - -4. **HuggingFace** (`mellea/backends/huggingface.py:1001-1092`): - ```python - # Calculate from sequences (lines 1063-1076) - hf_output = mot._meta.get("hf_output") - if isinstance(hf_output, GenerateDecoderOnlyOutput): - if input_ids is not None and hf_output.sequences is not None: - try: - n_prompt = input_ids.shape[1] - n_completion = hf_output.sequences[0].shape[0] - n_prompt - # Convert to OpenAI-compatible format - mot.usage = { - "prompt_tokens": n_prompt, - "completion_tokens": n_completion, - "total_tokens": n_prompt + n_completion, - } - except Exception: - pass # Leave as None if calculation fails - ``` - -5. **WatsonX** (`mellea/backends/watsonx.py:450-508`): - ```python - # Extract from response usage (similar to OpenAI) - response = mot._meta.get("watsonx_response") - usage = response.get("usage") if isinstance(response, dict) else None - - # Populate standardized field (already matches OpenAI format) - if usage: - mot.usage = { - "prompt_tokens": get_value(usage, "prompt_tokens") or 0, - "completion_tokens": get_value(usage, "completion_tokens") or 0, - "total_tokens": get_value(usage, "total_tokens") or 0, - } - ``` - -**Files to Modify**: -- `mellea/core/base.py` (add field) -- `mellea/backends/openai.py` (populate field, remove direct metrics call) -- `mellea/backends/ollama.py` (populate field, remove direct metrics call) -- `mellea/backends/litellm.py` (populate field, remove direct metrics call) -- `mellea/backends/huggingface.py` (populate field, remove direct metrics call) -- `mellea/backends/watsonx.py` (populate field, remove direct metrics call) - -### Phase 2: Create Metrics Plugin (Issue #608) - -#### 2.1 Create Token Metrics Plugin - -**New File**: `mellea/plugins/builtin/token_metrics.py` - -**Implementation**: -```python -"""Built-in plugin for recording token usage metrics via OpenTelemetry.""" - -from mellea.plugins import Plugin, hook, HookType, PluginMode -from mellea.plugins.hooks.generation import GenerationPostCallPayload - - -class TokenMetricsPlugin(Plugin, name="token-metrics", priority=100): - """Records token usage metrics from generation_post_call hook. - - This plugin automatically records input/output token counts to OpenTelemetry - metrics when MELLEA_METRICS_ENABLED=true. It reads from the standardized - ModelOutputThunk.usage field populated by backends. - - Execution mode: FIRE_AND_FORGET (async, non-blocking, observe-only) - Priority: 100 (runs after other plugins) - """ - - @hook(HookType.GENERATION_POST_CALL, mode=PluginMode.FIRE_AND_FORGET) - async def record_tokens( - self, - payload: GenerationPostCallPayload, - ctx - ): - """Record token usage metrics from the model output.""" - from mellea.telemetry.metrics import is_metrics_enabled - - # Early return if metrics disabled (zero overhead) - if not is_metrics_enabled(): - return - - mot = payload.model_output - if mot is None or mot.usage is None: - return - - # Extract backend info from context - backend = ctx.global_context.state.get("backend") - if backend is None: - return - - from mellea.telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from mellea.telemetry.metrics import record_token_usage_metrics - - # Record using standardized usage dict - record_token_usage_metrics( - input_tokens=mot.usage.get("prompt_tokens"), - output_tokens=mot.usage.get("completion_tokens"), - model=get_model_id_str(backend), - backend=backend.__class__.__name__, - system=get_system_name(backend), - ) -``` - -**Key Design Decisions**: -- **FIRE_AND_FORGET mode**: Metrics recording is async, non-blocking, and cannot fail the generation -- **Priority 100**: Runs after other plugins (lower priority = earlier execution) -- **Observe-only**: Reads from `model_output.usage`, doesn't modify payload -- **Zero overhead**: Early return when metrics disabled -- **Backend-agnostic**: Works with any backend that populates `usage` -- **OpenAI-compatible**: Uses standard field name for familiarity and future extensibility - -#### 2.2 Auto-Register Plugin When Metrics Enabled - -**File**: `mellea/telemetry/metrics.py` - -**Changes**: -```python -# At module initialization (after _meter setup, around line 247) -if _OTEL_AVAILABLE and _METRICS_ENABLED: - _meter_provider = _setup_meter_provider() - if _meter_provider is not None: - _meter = metrics.get_meter("mellea.metrics", version("mellea")) - - # Auto-register token metrics plugin - from mellea.plugins.builtin.token_metrics import TokenMetricsPlugin - from mellea.plugins import register - - _token_metrics_plugin = TokenMetricsPlugin() - register(_token_metrics_plugin) -``` - -**Rationale**: -- Automatic activation when `MELLEA_METRICS_ENABLED=true` -- No user code changes required -- Consistent with existing metrics module behavior -- Plugin is globally registered (works with both session and functional API) - -#### 2.3 Update Plugin Module Structure - -**New Directory**: `mellea/plugins/builtin/` - -**Files**: -- `mellea/plugins/builtin/__init__.py` - Export built-in plugins -- `mellea/plugins/builtin/token_metrics.py` - Token metrics plugin - -**Update**: `mellea/plugins/__init__.py` -```python -# Add to exports -from mellea.plugins.builtin import TokenMetricsPlugin - -__all__ = [ - # ... existing exports ... - "TokenMetricsPlugin", -] -``` - -### Phase 3: Update Tests - -#### 3.1 Update Backend Integration Tests - -**Files**: `test/telemetry/test_metrics_backend.py` - -**Changes**: -- Tests should verify `mot.token_usage` is populated (not just metrics recorded) -- Add assertions: `assert mot.token_usage is not None` -- Add assertions: `assert mot.token_usage["prompt_tokens"] > 0` -- Keep existing metrics verification (plugin should still record them) - -**Example**: -```python -async def test_openai_token_metrics(ollama_backend): - """Test that OpenAI backend populates usage and metrics are recorded.""" - # ... existing test setup ... - - # Verify usage field is populated - assert mot.usage is not None - assert mot.usage["prompt_tokens"] > 0 - assert mot.usage["completion_tokens"] > 0 - assert mot.usage["total_tokens"] > 0 - - # Verify metrics were recorded (via plugin) - # ... existing metrics verification ... -``` - -#### 3.2 Add Plugin-Specific Tests - -**New File**: `test/plugins/test_token_metrics_plugin.py` - -**Tests**: -1. Test plugin registers correctly when metrics enabled -2. Test plugin reads from `mot.usage` correctly -3. Test plugin handles missing `usage` gracefully -4. Test plugin is no-op when metrics disabled -5. Test plugin works with all backends -6. Test plugin doesn't block on errors (FIRE_AND_FORGET mode) - -#### 3.3 Update Unit Tests - -**File**: `test/telemetry/test_metrics_token.py` - -**Changes**: -- Tests for `record_token_usage_metrics()` remain unchanged (function still exists) -- Add note that function is now called by plugin, not backends directly - -### Phase 4: Update Documentation - -#### 4.1 Update Telemetry Documentation - -**File**: `docs/dev/telemetry.md` - -**Changes**: -- Update "Token Usage Metrics" section to explain plugin-based architecture -- Add note about `ModelOutputThunk.usage` field (OpenAI-compatible) -- Update backend support table to note standardized field -- Add example of accessing token usage programmatically - -**Example Addition**: -```markdown -### Programmatic Access to Token Usage - -Token usage information is available on the `ModelOutputThunk` after generation: - -```python -with start_session() as m: - result = m.instruct("What is the capital of France?") - - # Access token usage (OpenAI-compatible format) - if result.usage: - print(f"Prompt tokens: {result.usage['prompt_tokens']}") - print(f"Completion tokens: {result.usage['completion_tokens']}") - print(f"Total tokens: {result.usage['total_tokens']}") -``` - -Token metrics are automatically recorded to OpenTelemetry when -`MELLEA_METRICS_ENABLED=true` via the built-in `TokenMetricsPlugin`. -``` - -#### 4.2 Update Plugin Documentation - -**File**: `docs/docs/core-concept/plugins.mdx` - -**Changes**: -- Add section on built-in plugins -- Document `TokenMetricsPlugin` as an example of FIRE_AND_FORGET mode -- Show how built-in plugins auto-register - -#### 4.3 Update Examples - -**File**: `docs/examples/telemetry/metrics_example.py` - -**Changes**: -- Add example showing programmatic access to `usage` -- Add comment explaining plugin-based architecture -- No functional changes (metrics still work the same way) - -**Example Addition**: -```python -# Example 5: Programmatic token access -print("\n5. Accessing token usage programmatically...") -result = m.instruct("Count to five") -if result.usage: - print(f" Prompt tokens: {result.usage['prompt_tokens']}") - print(f" Completion tokens: {result.usage['completion_tokens']}") - print(f" Total tokens: {result.usage['total_tokens']}") -``` - -### Phase 5: Cleanup - -#### 5.1 Remove Backend-Specific Metrics Code - -**All Backend Files**: -- Remove `from ..telemetry.metrics import is_metrics_enabled` imports -- Remove `from ..telemetry.metrics import record_token_usage_metrics` imports -- Remove `from ..telemetry.backend_instrumentation import get_model_id_str, get_system_name` imports (if only used for metrics) -- Remove `if is_metrics_enabled():` blocks and `record_token_usage_metrics()` calls - -**Estimated Lines Removed**: ~15-20 lines per backend × 5 backends = ~75-100 lines - -#### 5.2 Keep Core Metrics Infrastructure - -**Files to Keep Unchanged**: -- `mellea/telemetry/metrics.py` - Core metrics functions still needed by plugin -- `mellea/telemetry/backend_instrumentation.py` - Helper functions still needed - -## Implementation Order - -### Step 1: Add `token_usage` Field (Issue #607) -1. Modify `mellea/core/base.py` to add `token_usage` field -2. Update all 5 backends to populate the field -3. Keep existing `record_token_usage_metrics()` calls temporarily (dual-write) -4. Run tests to verify field is populated correctly - -### Step 2: Create Metrics Plugin (Issue #608) -5. Create `mellea/plugins/builtin/` directory -6. Implement `TokenMetricsPlugin` in `token_metrics.py` -7. Auto-register plugin in `mellea/telemetry/metrics.py` -8. Run tests to verify plugin records metrics correctly - -### Step 3: Remove Duplicate Code -9. Remove direct `record_token_usage_metrics()` calls from all backends -10. Remove unused imports from backends -11. Run full test suite to verify no regressions - -### Step 4: Update Documentation and Tests -12. Update backend integration tests to verify `token_usage` field -13. Add plugin-specific tests -14. Update documentation (telemetry.md, plugins.mdx) -15. Update examples to show programmatic access - -## Testing Strategy - -### Test Coverage Required - -1. **Unit Tests** (existing, should pass unchanged): - - `test/telemetry/test_metrics_token.py` - Tests for `record_token_usage_metrics()` - -2. **Backend Integration Tests** (need updates): - - `test/telemetry/test_metrics_backend.py` - Add `token_usage` field assertions - - All 5 backend tests should verify field population - -3. **Plugin Tests** (new): - - `test/plugins/test_token_metrics_plugin.py` - Plugin-specific tests - - Test plugin registration, execution, error handling - -4. **End-to-End Tests** (existing, should pass): - - `docs/examples/telemetry/metrics_example.py` - Should work unchanged - -### Test Execution - -```bash -# Run metrics tests -uv run pytest test/telemetry/test_metrics*.py -v - -# Run plugin tests -uv run pytest test/plugins/test_token_metrics_plugin.py -v - -# Run backend tests -uv run pytest test/backends/ -k "metrics" -v - -# Run example as test -uv run pytest docs/examples/telemetry/metrics_example.py -v -``` - -## Migration Impact - -### Breaking Changes -**None** - This is a pure refactor with no API changes: -- Metrics still recorded automatically when `MELLEA_METRICS_ENABLED=true` -- Same metric names, attributes, and exporters -- User code unchanged - -### New Features -- **Programmatic access**: Users can now access `mot.token_usage` directly -- **Extensibility**: Users can create custom metrics plugins following this pattern -- **Consistency**: Token data in standard location across all backends - -### Performance Impact -- **Negligible**: Plugin uses FIRE_AND_FORGET mode (async, non-blocking) -- **Same overhead**: Metrics recording logic unchanged, just moved to plugin -- **Zero overhead when disabled**: Early return in plugin when metrics disabled - -## Risks and Mitigations - -### Risk 1: Hook Timing -**Risk**: `generation_post_call` fires after `post_processing()`, but what if backends don't populate `token_usage` in time? - -**Mitigation**: Backends populate `token_usage` **during** `post_processing()`, which completes **before** the hook fires. The hook call is at line 384-398 in `ModelOutputThunk.astream()`, after line 366 (`await self._post_process(self)`). - -### Risk 2: Missing Token Data -**Risk**: Some backends might not have token usage available. - -**Mitigation**: -- `token_usage` field is optional (None when unavailable) -- Plugin checks `if mot.token_usage is None: return` -- Existing behavior preserved (metrics not recorded when data unavailable) - -### Risk 3: Plugin Framework Dependency -**Risk**: Plugin requires `mellea[hooks]` extra dependency. - -**Mitigation**: -- Plugin only imported when metrics enabled -- Graceful fallback if hooks not installed (warning message) -- Most users enabling metrics will have full installation - -### Risk 4: Test Failures -**Risk**: Existing tests might fail if they expect metrics without the plugin. - -**Mitigation**: -- Plugin auto-registers when metrics enabled (same as before) -- Tests run with `MELLEA_METRICS_ENABLED=true` will get plugin automatically -- Dual-write during Step 1 ensures no test breakage during transition - -## Success Criteria - -### Phase 1 Complete When: -- [ ] `ModelOutputThunk.token_usage` field added -- [ ] All 5 backends populate the field correctly -- [ ] Backend integration tests verify field population -- [ ] Existing metrics tests still pass (dual-write active) - -### Phase 2 Complete When: -- [ ] `TokenMetricsPlugin` implemented and tested -- [ ] Plugin auto-registers when metrics enabled -- [ ] Plugin records metrics correctly (verified by existing tests) -- [ ] Plugin-specific tests added and passing - -### Phase 3 Complete When: -- [ ] Direct `record_token_usage_metrics()` calls removed from backends -- [ ] Unused imports cleaned up -- [ ] All tests pass (metrics now recorded via plugin only) - -### Phase 4 Complete When: -- [ ] Documentation updated (telemetry.md, plugins.mdx) -- [ ] Examples updated to show programmatic access -- [ ] All documentation builds without errors - -### Overall Success: -- [ ] All tests pass (`uv run pytest test/`) -- [ ] No breaking changes to user API -- [ ] Metrics still recorded correctly -- [ ] Code is cleaner (no duplication across backends) -- [ ] Token usage accessible programmatically via `mot.token_usage` - -## File Checklist - -### Files to Create -- [ ] `mellea/plugins/builtin/__init__.py` -- [ ] `mellea/plugins/builtin/token_metrics.py` -- [ ] `test/plugins/test_token_metrics_plugin.py` - -### Files to Modify -- [ ] `mellea/core/base.py` - Add `token_usage` field -- [ ] `mellea/backends/openai.py` - Populate field, remove metrics call -- [ ] `mellea/backends/ollama.py` - Populate field, remove metrics call -- [ ] `mellea/backends/litellm.py` - Populate field, remove metrics call -- [ ] `mellea/backends/huggingface.py` - Populate field, remove metrics call -- [ ] `mellea/backends/watsonx.py` - Populate field, remove metrics call -- [ ] `mellea/telemetry/metrics.py` - Auto-register plugin -- [ ] `mellea/plugins/__init__.py` - Export TokenMetricsPlugin -- [ ] `test/telemetry/test_metrics_backend.py` - Add token_usage assertions -- [ ] `docs/dev/telemetry.md` - Document new architecture -- [ ] `docs/docs/core-concept/plugins.mdx` - Document built-in plugin -- [ ] `docs/examples/telemetry/metrics_example.py` - Add programmatic access example - -### Files to Review (No Changes Expected) -- `mellea/telemetry/backend_instrumentation.py` - Helper functions still used -- `test/telemetry/test_metrics_token.py` - Unit tests for core function -- `mellea/plugins/hooks/generation.py` - Payload definition unchanged - -## Estimated Effort - -- **Phase 1** (Standardize storage): 2-3 hours - - Add field: 15 min - - Update 5 backends: 1.5 hours (30 min each) - - Update tests: 1 hour - -- **Phase 2** (Create plugin): 1-2 hours - - Implement plugin: 45 min - - Auto-registration: 15 min - - Plugin tests: 1 hour - -- **Phase 3** (Cleanup): 30 min - - Remove duplicate code: 20 min - - Verify tests: 10 min - -- **Phase 4** (Documentation): 1 hour - - Update docs: 30 min - - Update examples: 30 min - -**Total**: 4.5-6.5 hours - -## Open Questions - -1. **Should `token_usage` be added to `GenerationPostCallPayload`?** - - Pro: Makes it explicit in the hook payload - - Con: Redundant (already on `model_output`) - - **Recommendation**: No, keep it on MOT only (single source of truth) - -2. **Should the plugin be optional or always registered?** - - Current plan: Auto-register when `MELLEA_METRICS_ENABLED=true` - - Alternative: Always register, let plugin check `is_metrics_enabled()` internally - - **Recommendation**: Auto-register (cleaner, no overhead when disabled) - -3. **Should we support custom metrics plugins?** - - Users could create their own plugins reading from `token_usage` - - **Recommendation**: Yes, document this pattern in plugins.mdx - -4. **What about vLLM backend?** - - Not modified in commit 0e71558 (no metrics support) - - **Recommendation**: Out of scope for this refactor, can be added later - -## References - -- Issue #607: https://github.com/generative-computing/mellea/issues/607 -- Issue #608: https://github.com/generative-computing/mellea/issues/608 -- PR #563 (metrics): https://github.com/generative-computing/mellea/pull/563 -- PR #582 (hooks): https://github.com/generative-computing/mellea/pull/582 -- Commit 0e71558: Metrics implementation -- Commit cbd63bd: Hooks/plugins system -- Hook system spec: `docs/dev/hook_system.md` -- Plugin examples: `docs/examples/plugins/` \ No newline at end of file diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 805f8c4d..e2881f3b 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -328,6 +328,14 @@ async def test_async_avalue(session) -> None: assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "huggingface" + @pytest.mark.qualitative async def test_generate_with_lock(backend) -> None: diff --git a/test/backends/test_litellm_ollama.py b/test/backends/test_litellm_ollama.py index ece9c890..d41367bb 100644 --- a/test/backends/test_litellm_ollama.py +++ b/test/backends/test_litellm_ollama.py @@ -202,6 +202,14 @@ async def test_async_avalue(session): assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "litellm" + if __name__ == "__main__": import pytest diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index 706766ea..a4b75f59 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -196,6 +196,14 @@ async def test_async_avalue(session) -> None: assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "ollama" + def test_multiple_asyncio_runs(session) -> None: async def test(): diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 142d0781..10e4ac0b 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -194,6 +194,14 @@ async def test_async_avalue(m_session) -> None: assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "openai" + def test_client_cache(backend) -> None: first_client = backend._async_client diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index c9f27be2..87784e0b 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -207,6 +207,14 @@ async def test_async_avalue(session): assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "watsonx" + def test_client_cache(backend): first_client = backend._model From 13345b6575b097d84a2d5e2274d31e1f30c0f635 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Fri, 13 Mar 2026 19:52:37 -0500 Subject: [PATCH 4/4] docs: update telemetry docs for plugin-based metrics Signed-off-by: Alex Bozarth --- AGENTS.md | 2 +- docs/dev/telemetry.md | 23 +++++++++++++++++++--- docs/examples/telemetry/metrics_example.py | 6 ++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0396b617..d414f1a7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -92,9 +92,9 @@ Tests/examples automatically skip if system lacks required resources. Heavy exam - **Google-style docstrings** - **Ruff** for linting/formatting - Use `...` in `@generative` function bodies -- Use `...` in `@generative` function bodies - Prefer primitives over classes - **Friendly Dependency Errors**: Wraps optional backend imports in `try/except ImportError` with a helpful message (e.g., "Please pip install mellea[hf]"). See `mellea/stdlib/session.py` for examples. +- **Backend telemetry fields**: All backends must populate `mot.usage` (dict with `prompt_tokens`, `completion_tokens`, `total_tokens`), `mot.model` (str), and `mot.provider` (str) in their `post_processing()` method. Metrics are automatically recorded by `TokenMetricsPlugin` — don't add manual `record_token_usage_metrics()` calls. ## 5. Commits & Hooks [Angular format](https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit): `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `release:` diff --git a/docs/dev/telemetry.md b/docs/dev/telemetry.md index 09fb342a..b0a52d6b 100644 --- a/docs/dev/telemetry.md +++ b/docs/dev/telemetry.md @@ -129,9 +129,8 @@ All token metrics include these attributes following Gen-AI semantic conventions | Attribute | Description | Example Values | |-----------|-------------|----------------| -| `gen_ai.system` | Backend system name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` | +| `gen_ai.provider.name` | Backend provider name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` | | `gen_ai.request.model` | Model identifier | `gpt-4`, `llama3.2:7b`, `granite-3.1-8b-instruct` | -| `mellea.backend` | Backend class name | `OpenAIBackend`, `OllamaBackend`, `WatsonxBackend` | #### Backend Support @@ -365,12 +364,30 @@ if is_metrics_enabled(): print("Token metrics are being collected") ``` +Access token usage data from `ModelOutputThunk`: + +```python +from mellea import start_session + +with start_session() as m: + result = m.instruct("Write a haiku about programming") + + # Access token usage (follows OpenAI API format) + if result.usage: + print(f"Prompt tokens: {result.usage['prompt_tokens']}") + print(f"Completion tokens: {result.usage['completion_tokens']}") + print(f"Total tokens: {result.usage['total_tokens']}") +``` + +The `usage` field is a dictionary with three keys: `prompt_tokens`, `completion_tokens`, and `total_tokens`. All backends populate this field consistently. + #### Performance -- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), `record_token_usage_metrics()` returns immediately with no processing +- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), the TokenMetricsPlugin is not registered and has no overhead - **Minimal overhead when enabled**: Counter increments are extremely fast (~nanoseconds per operation) - **Async export**: Metrics are batched and exported asynchronously (default: every 60 seconds) - **Non-blocking**: Metric recording never blocks LLM calls +- **Automatic collection**: Metrics are recorded via hooks after generation completes—no manual instrumentation needed #### Use Cases diff --git a/docs/examples/telemetry/metrics_example.py b/docs/examples/telemetry/metrics_example.py index d7bf1c54..c8630c55 100644 --- a/docs/examples/telemetry/metrics_example.py +++ b/docs/examples/telemetry/metrics_example.py @@ -101,6 +101,12 @@ def main(): ) print(f"Email: {str(email)[:100]}...") + # Token usage is available on the result from instruct() + if email.usage: + print(f" → Prompt tokens: {email.usage['prompt_tokens']}") + print(f" → Completion tokens: {email.usage['completion_tokens']}") + print(f" → Total tokens: {email.usage['total_tokens']}") + # Example 3: Multiple operations print("\n3. Multiple operations...") text = "Hello, how are you today?"